From 23af5b7e18a78a45950fd5bbb000ebafcfda56f8 Mon Sep 17 00:00:00 2001 From: "akw27@arcadians.cl.cam.ac.uk" Date: Fri, 20 May 2005 14:49:37 +0000 Subject: [PATCH] bitkeeper revision 1.1473.1.1 (428df901D5uzzXaFBp8z6tkbP0gV0w) - Finer-grained asynchronous dispatch in parallax daemon. - Cleanups and cull of older code. - Fixes to handle changes in block protocol. Signed-off-by: andrew.warfield@cl.cam.ac.uk --- .rootkeys | 23 +- tools/blktap/Makefile | 51 +-- tools/blktap/blkaio.c | 19 - tools/blktap/blkaiolib.c | 489 ----------------------- tools/blktap/blkaiolib.h | 16 - tools/blktap/blkcow.c | 31 -- tools/blktap/blkcowgnbd.c | 24 -- tools/blktap/blkcowimg.c | 24 -- tools/blktap/blkcowlib.c | 380 ------------------ tools/blktap/blkcowlib.h | 14 - tools/blktap/blkdump.c | 12 - tools/blktap/blkgnbd.c | 19 - tools/blktap/blkgnbdlib.c | 471 ---------------------- tools/blktap/blkgnbdlib.h | 16 - tools/blktap/blkimg.c | 19 - tools/blktap/blkimglib.c | 325 ---------------- tools/blktap/blkimglib.h | 16 - tools/blktap/block-async.c | 404 +++++++++++++++++++ tools/blktap/block-async.h | 69 ++++ tools/blktap/blockstore-tls.c | 161 -------- tools/blktap/blockstore.c | 7 +- tools/blktap/libgnbd/Makefile | 8 - tools/blktap/libgnbd/gnbdtest.c | 90 ----- tools/blktap/libgnbd/libgnbd.c | 647 ------------------------------- tools/blktap/libgnbd/libgnbd.h | 25 -- tools/blktap/parallax-threaded.c | 34 +- tools/blktap/parallax.c | 275 ++++++++----- tools/blktap/radix.c | 417 -------------------- tools/blktap/radix.h | 10 + tools/blktap/requests-async.c | 629 ++++++++++++++++++++++++++++++ tools/blktap/requests-async.h | 19 + tools/blktap/vdi.c | 25 +- tools/blktap/vdi.h | 10 +- 33 files changed, 1375 insertions(+), 3404 deletions(-) delete mode 100644 tools/blktap/blkaio.c delete mode 100644 tools/blktap/blkaiolib.c delete mode 100644 tools/blktap/blkaiolib.h delete mode 100644 tools/blktap/blkcow.c delete mode 100644 tools/blktap/blkcowgnbd.c delete mode 100644 tools/blktap/blkcowimg.c delete mode 100644 tools/blktap/blkcowlib.c delete mode 100644 tools/blktap/blkcowlib.h delete mode 100644 tools/blktap/blkgnbd.c delete mode 100644 tools/blktap/blkgnbdlib.c delete mode 100644 tools/blktap/blkgnbdlib.h delete mode 100644 tools/blktap/blkimg.c delete mode 100644 tools/blktap/blkimglib.c delete mode 100644 tools/blktap/blkimglib.h create mode 100755 tools/blktap/block-async.c create mode 100755 tools/blktap/block-async.h delete mode 100644 tools/blktap/blockstore-tls.c delete mode 100644 tools/blktap/libgnbd/Makefile delete mode 100644 tools/blktap/libgnbd/gnbdtest.c delete mode 100644 tools/blktap/libgnbd/libgnbd.c delete mode 100644 tools/blktap/libgnbd/libgnbd.h create mode 100755 tools/blktap/requests-async.c create mode 100755 tools/blktap/requests-async.h diff --git a/.rootkeys b/.rootkeys index a4eeac0c72..67dc5d004b 100644 --- a/.rootkeys +++ b/.rootkeys @@ -477,38 +477,23 @@ 4209033eUwhDBJ_bxejiv5c6gjXS4A tools/blktap/Makefile 4209033ewLAHdhGrT_2jo3Gb_5bDcA tools/blktap/README 42277b02mYXxgijE7MFeUe9d8eldMw tools/blktap/README-PARALLAX -4209033eX_Xw94wHaOCtnU9nOAtSJA tools/blktap/blkaio.c -4209033egwf6LDxM2hbaqi9rRdZy4A tools/blktap/blkaiolib.c -4209033f9yELLK85Ipo2oKjr3ickgQ tools/blktap/blkaiolib.h -4209033fL9LcSI6LXrIp5O4axbUBLg tools/blktap/blkcow.c -4209033fUDlFGZreIyZHdP7h7yfvuQ tools/blktap/blkcowgnbd.c -4209033fCgZzLeMOwNBFmsp99x58ZQ tools/blktap/blkcowimg.c -4209033frfXH6oOi9AvRz08PPAndNA tools/blktap/blkcowlib.c -4209033fhFd_y2go9HgCF395A35xJg tools/blktap/blkcowlib.h 4209033fHgtGpb_K16_xC9CpkjNZLw tools/blktap/blkdump.c -4209033fm61CZG1RyKDW75V-eTZ9fg tools/blktap/blkgnbd.c -4209033fVfa-R6MFgGcmsQHTDna4PA tools/blktap/blkgnbdlib.c -4209033fIgDQbaHwHStHhPEDTtbqsA tools/blktap/blkgnbdlib.h -4209033figp5JRsKsXY8rw4keRumkg tools/blktap/blkimg.c -42090340V-8HKGlr00SyJGsE5jXC3A tools/blktap/blkimglib.c -42090340c7pQbh0Km8zLcEqPd_3zIg tools/blktap/blkimglib.h 42090340_mvZtozMjghPJO0qsjk4NQ tools/blktap/blkint.h 42090340rc2q1wmlGn6HtiJAkqhtNQ tools/blktap/blktaplib.c 42090340C-WkRPT7N3t-8Lzehzogdw tools/blktap/blktaplib.h -423f270cAbkh2f-DHtT0hmCtFFXVXg tools/blktap/blockstore-tls.c +428df8fdkg84W8yveE50EbkbTUZgjQ tools/blktap/block-async.c +428df8feTrgGFZEBMA_dYijy9DNs1g tools/blktap/block-async.h 42277b02WrfP1meTDPv1M5swFq8oHQ tools/blktap/blockstore.c 42277b02P1C0FYj3gqwTZUD8sxKCug tools/blktap/blockstore.h 42371b8aL1JsxAXOd4bBhmZKDyjiJg tools/blktap/blockstored.c 42371b8aD_x3L9MKsXciMNqkuk58eQ tools/blktap/bstest.c -42090340B3mDvcxvd9ehDHUkg46hvw tools/blktap/libgnbd/Makefile -42090340ZWkc5Xhf9lpQmDON8HJXww tools/blktap/libgnbd/gnbdtest.c -42090340ocMiUScJE3OpY7QNunvSbg tools/blktap/libgnbd/libgnbd.c -42090340G5_F_EeVnPORKB0pTMGGhA tools/blktap/libgnbd/libgnbd.h 423f270cbEKiTMapKnCyqkuwGvgOMA tools/blktap/parallax-threaded.c 423f270cFdXryIcD7HTPUl_Dbk4DAQ tools/blktap/parallax-threaded.h 42277b03930x2TJT3PZlw6o0GERXpw tools/blktap/parallax.c 42277b03XQYq8bujXSz7JAZ8N7j_pA tools/blktap/radix.c 42277b03vZ4-jno_mgKmAcCW3ycRAg tools/blktap/radix.h +428df8fe5RYONloDWVMkM-CfHfB1vA tools/blktap/requests-async.c +428df8feWeKJ-9HJb5_rFqdm_xqErg tools/blktap/requests-async.h 42277b03U_wLHL-alMA0bfxGlqldXg tools/blktap/snaplog.c 42277b04Ryya-z662BEx8HnxNN0dGQ tools/blktap/snaplog.h 42277b04LxFjptgZ75Z98DUAso4Prg tools/blktap/vdi.c diff --git a/tools/blktap/Makefile b/tools/blktap/Makefile index 9d794301c3..26187de779 100644 --- a/tools/blktap/Makefile +++ b/tools/blktap/Makefile @@ -22,12 +22,12 @@ PLX_SRCS := PLX_SRCS += vdi.c PLX_SRCS += radix.c PLX_SRCS += snaplog.c +PLX_SRCS += blockstore.c +PLX_SRCS += block-async.c PLXT_SRCS := $(PLX_SRCS) -#PLXT_SRCS += blockstore-tls.c -PLXT_SRCS += blockstore.c PLXT_SRCS += parallax-threaded.c -PLX_SRCS += blockstore.c VDI_SRCS := $(PLX_SRCS) +PLX_SRCS += requests-async.c PLX_SRCS += parallax.c VDI_TOOLS := @@ -55,10 +55,11 @@ CFLAGS += -Wp,-MD,.$(@F).d DEPS = .*.d OBJS = $(patsubst %.c,%.o,$(SRCS)) +IBINS = blkdump parallax $(VDI_TOOLS) LIB = libblktap.so libblktap.so.$(MAJOR) libblktap.so.$(MAJOR).$(MINOR) -all: mk-symlinks blkdump blkcow blkimg blkcowimg blkgnbd blkcowgnbd $(VDI_TOOLS) parallax parallax-threaded blockstored +all: mk-symlinks blkdump $(VDI_TOOLS) parallax parallax-threaded blockstored $(MAKE) $(LIB) LINUX_ROOT := $(wildcard $(XEN_ROOT)/linux-2.6.*-xen-sparse) @@ -77,10 +78,10 @@ install: all $(INSTALL_DIR) -p $(DESTDIR)/usr/include $(INSTALL_PROG) $(LIB) $(DESTDIR)/usr/$(LIBDIR) $(INSTALL_PROG) blktaplib.h $(DESTDIR)/usr/include - $(INSTALL_PROG) blkdump blkcow blkimg blkcowimg blkgnbd blkcowgnbd $(DESTDIR)/$(BLKTAP_INSTALL_DIR) + $(INSTALL_PROG) $(IBINS) $(DESTDIR)/$(BLKTAP_INSTALL_DIR) clean: - rm -rf *.a *.so *.o *.rpm $(LIB) *~ $(DEPS) xen TAGS blkdump blkcow blkimg blkcowimg blkgnbd blkcowgnbd blkaio $(VDI_TOOLS) parallax + rm -rf *.a *.so *.o *.rpm $(LIB) *~ $(DEPS) xen TAGS blkdump $(VDI_TOOLS) parallax parallax-threaded rpm: all rm -rf staging @@ -101,32 +102,11 @@ libblktap.so.$(MAJOR).$(MINOR): $(OBJS) blkdump: $(LIB) $(CC) $(CFLAGS) -o blkdump -L$(XEN_LIBXC) -L$(XEN_LIBXUTIL) -L. -l blktap blkdump.c -blkcowimg: $(LIB) blkcowimg.c blkcowlib.c blkimglib.c - $(CC) $(CFLAGS) -o blkcowimg -ldb -L$(XEN_LIBXC) -L$(XEN_LIBXUTIL) -L. -l blktap blkcowimg.c blkimglib.c blkcowlib.c - -blkcow: $(LIB) blkcow.c blkcowlib.c - $(CC) $(CFLAGS) -o blkcow -ldb -L$(XEN_LIBXC) -L$(XEN_LIBXUTIL) -L. -l blktap blkcow.c blkcowlib.c - -blkimg: $(LIB) blkimg.c blkimglib.c - $(CC) $(CFLAGS) -o blkimg -L$(XEN_LIBXC) -L$(XEN_LIBXUTIL) -L. -l blktap blkimg.c blkimglib.c - -blkgnbd: $(LIB) blkgnbd.c blkgnbdlib.c - $(CC) $(CFLAGS) -o blkgnbd -L$(XEN_LIBXC) -L$(XEN_LIBXUTIL) -L. -lblktap blkgnbd.c blkgnbdlib.c libgnbd/libgnbd.a - -blkcowgnbd: $(LIB) blkgnbd.c blkcowlib.c blkgnbdlib.c - $(CC) $(CFLAGS) -o blkcowgnbd -ldb -L$(XEN_LIBXC) -L$(XEN_LIBXUTIL) -L. -lblktap blkcowgnbd.c blkgnbdlib.c blkcowlib.c libgnbd/libgnbd.a - -blkaio: $(LIB) blkaio.c blkaiolib.c - $(CC) $(CFLAGS) -o blkaio -L$(XEN_LIBXC) -L$(XEN_LIBXUTIL) -L. -lblktap blkaio.c blkaiolib.c -laio -lpthread - parallax: $(LIB) $(PLX_SRCS) - $(CC) $(CFLAGS) -o parallax -L$(XEN_LIBXC) -L$(XEN_LIBXUTIL) -L. -lblktap -lpthread $(PLX_SRCS) libgnbd/libgnbd.a + $(CC) $(CFLAGS) -o parallax -L$(XEN_LIBXC) -L$(XEN_LIBXUTIL) -L. -lblktap -lpthread $(PLX_SRCS) parallax-threaded: $(LIB) $(PLXT_SRCS) - $(CC) $(CFLAGS) -o parallax-threaded -L$(XEN_LIBXC) -L$(XEN_LIBXUTIL) -L. -lpthread -lblktap $(PLXT_SRCS) libgnbd/libgnbd.a - -vdi_test: $(LIB) $(VDI_SRCS) - $(CC) $(CFLAGS) -g3 -o vdi_test -DVDI_STANDALONE -lpthread $(VDI_SRCS) + $(CC) $(CFLAGS) -o parallax-threaded -L$(XEN_LIBXC) -L$(XEN_LIBXUTIL) -L. -lpthread -lblktap $(PLXT_SRCS) vdi_list: $(LIB) vdi_list.c $(VDI_SRCS) $(CC) $(CFLAGS) -g3 -o vdi_list vdi_list.c -lpthread $(VDI_SRCS) @@ -163,16 +143,3 @@ TAGS: -include $(DEPS) -#Random testing targets. To be removed eventually. - -rdx_cmp: $(LIB) rdx_cmp.c $(VDI_SRCS) - $(CC) $(CFLAGS) -g3 -o rdx_cmp rdx_cmp.c $(VDI_SRCS) - -bb-tls: $(LIB) blockstore-benchmark.c - $(CC) $(CFLAGS) -o bb-tls blockstore-benchmark.c blockstore-tls.c -lpthread - -bb-trans: $(LIB) blockstore-benchmark.c - $(CC) $(CFLAGS) -o bb-trans blockstore-benchmark.c blockstore.c -lpthread - -radix-test: $(LIB) radix.c blockstore.c - $(CC) $(CFLAGS) -g3 -D RADIX_STANDALONE -o radix-test radix.c blockstore-threaded-trans.c diff --git a/tools/blktap/blkaio.c b/tools/blktap/blkaio.c deleted file mode 100644 index 25495718a4..0000000000 --- a/tools/blktap/blkaio.c +++ /dev/null @@ -1,19 +0,0 @@ -/* blkaio.c - * - * libaio-backed disk. - */ - -#include "blktaplib.h" -#include "blkaiolib.h" - - -int main(int argc, char *argv[]) -{ - aio_init(); - - blktap_register_ctrl_hook("aio_control", aio_control); - blktap_register_request_hook("aio_request", aio_request); - blktap_listen(); - - return 0; -} diff --git a/tools/blktap/blkaiolib.c b/tools/blktap/blkaiolib.c deleted file mode 100644 index 4538a9eb31..0000000000 --- a/tools/blktap/blkaiolib.c +++ /dev/null @@ -1,489 +0,0 @@ -/* blkaiolib.c - * - * file/device image-backed block device -- using linux libaio. - * - * (c) 2004 Andrew Warfield. - * - * Xend has been modified to use an amorfs:[fsid] disk tag. - * This will show up as device type (maj:240,min:0) = 61440. - * - * The fsid is placed in the sec_start field of the disk extent. - * - * NOTE: This doesn't work. Grrr. - */ - -#define _GNU_SOURCE -#define __USE_LARGEFILE64 - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "blktaplib.h" - -//#define TMP_IMAGE_FILE_NAME "/dev/sda1" -#define TMP_IMAGE_FILE_NAME "fc3.image" - -#define MAX_DOMS 1024 -#define MAX_IMGNAME_LEN 255 -#define AMORFS_DEV 61440 -#define MAX_REQUESTS 64 /* must be synced with the blkif drivers. */ -#define MAX_SEGMENTS_PER_REQ 11 -#define SECTOR_SHIFT 9 -#define MAX_AIO_REQS (MAX_REQUESTS * MAX_SEGMENTS_PER_REQ) - -#if 1 -#define DPRINTF(_f, _a...) printf ( _f , ## _a ) -#else -#define DPRINTF(_f, _a...) ((void)0) -#endif - -#if 1 -#define ASSERT(_p) \ - if ( !(_p) ) { printf("Assertion '%s' failed, line %d, file %s", #_p , \ - __LINE__, __FILE__); *(int*)0=0; } -#else -#define ASSERT(_p) ((void)0) -#endif - -char dbg_page[4096]; - -typedef struct { - /* These need to turn into an array/rbtree for multi-disk support. */ - int fd; - u64 fsid; - char imgname[MAX_IMGNAME_LEN]; - blkif_vdev_t vdevice; -} image_t; - -/* Note on pending_reqs: I assume all reqs are queued before they start to - * get filled. so count of 0 is an unused record. - */ -typedef struct { - blkif_request_t req; - int count; -} pending_req_t; - -static pending_req_t pending_list[MAX_REQUESTS]; -image_t *images[MAX_DOMS]; - -static io_context_t ctx; -static struct iocb *iocb_free[MAX_AIO_REQS]; -static int iocb_free_count; - -/* ---[ Notification mecahnism ]--------------------------------------- */ - -enum { - READ = 0, - WRITE = 1 -}; - -static int aio_notify[2]; -static volatile int aio_listening = 0; - -static struct io_event aio_events[MAX_AIO_REQS]; -static int aio_event_count = 0; - -/* this is commented out in libaio.h for some reason. */ -extern int io_queue_wait(io_context_t ctx, struct timespec *timeout); - -static void *notifier_thread(void *arg) -{ - int ret; - int msg = 0x00feeb00; - - printf("Notifier thread started.\n"); - for (;;) { - //if ((aio_listening) && ((ret = io_queue_wait(ctx, 0)) == 0)) { - if ((aio_listening) && - ((ret = io_getevents(ctx, 1, MAX_AIO_REQS, aio_events, 0)) > 0)) { - aio_event_count = ret; - printf("[Notifying! (%d)]\n", aio_event_count); - aio_listening = 0; - write(aio_notify[WRITE], &msg, sizeof(msg)); - fsync(aio_notify[WRITE]); - } else { - if (aio_listening) - printf("[io_queue_wait error! %d]\n", errno); - usleep(1000); /* Not ready to read. */ - } - } -} - -/* -------------------------------------------------------------------- */ - -int aio_control(control_msg_t *msg) -{ - domid_t domid; - DB *db; - int ret; - - if (msg->type != CMSG_BLKIF_BE) - { - printf("***\nUNEXPECTED CTRL MSG MAJOR TYPE(%d)\n***\n", msg->type); - return 0; - } - - switch(msg->subtype) - { - case CMSG_BLKIF_BE_CREATE: - if ( msg->length != sizeof(blkif_be_create_t) ) - goto parse_error; - printf("[CONTROL_MSG] CMSG_BLKIF_BE_CREATE(d:%d,h:%d)\n", - ((blkif_be_create_t *)msg->msg)->domid, - ((blkif_be_create_t *)msg->msg)->blkif_handle); - domid = ((blkif_be_create_t *)msg->msg)->domid; - if (images[domid] != NULL) { - printf("attempt to connect from an existing dom!\n"); - return 0; - } - - images[domid] = (image_t *)malloc(sizeof(image_t)); - if (images[domid] == NULL) { - printf("error allocating image record.\n"); - return 0; - } - - images[domid]->fd = -1; - images[domid]->fsid = 0; - - printf("Image connected.\n"); - break; - - case CMSG_BLKIF_BE_DESTROY: - if ( msg->length != sizeof(blkif_be_destroy_t) ) - goto parse_error; - printf("[CONTROL_MSG] CMSG_BLKIF_BE_DESTROY(d:%d,h:%d)\n", - ((blkif_be_destroy_t *)msg->msg)->domid, - ((blkif_be_destroy_t *)msg->msg)->blkif_handle); - - domid = ((blkif_be_destroy_t *)msg->msg)->domid; - if (images[domid] != NULL) { - if (images[domid]->fd != -1) - close( images[domid]->fd ); - free( images[domid] ); - images[domid] = NULL; - } - break; - case CMSG_BLKIF_BE_VBD_GROW: - { - blkif_be_vbd_grow_t *grow; - - if ( msg->length != sizeof(blkif_be_vbd_grow_t) ) - goto parse_error; - printf("[CONTROL_MSG] CMSG_BLKIF_BE_VBD_GROW(d:%d,h:%d,v:%d)\n", - ((blkif_be_vbd_grow_t *)msg->msg)->domid, - ((blkif_be_vbd_grow_t *)msg->msg)->blkif_handle, - ((blkif_be_vbd_grow_t *)msg->msg)->vdevice); - printf(" Extent: sec_start: %llu sec_len: %llu, dev: %d\n", - ((blkif_be_vbd_grow_t *)msg->msg)->extent.sector_start, - ((blkif_be_vbd_grow_t *)msg->msg)->extent.sector_length, - ((blkif_be_vbd_grow_t *)msg->msg)->extent.device); - grow = (blkif_be_vbd_grow_t *)msg->msg; - domid = grow->domid; - if (images[domid] == NULL) { - printf("VBD_GROW on unconnected domain!\n"); - return 0; - } - - if (grow->extent.device != AMORFS_DEV) { - printf("VBD_GROW on non-amorfs device!\n"); - return 0; - } - - /* TODO: config support for arbitrary image files/modes. */ - sprintf(images[domid]->imgname, TMP_IMAGE_FILE_NAME); - - images[domid]->fsid = grow->extent.sector_start; - images[domid]->vdevice = grow->vdevice; - images[domid]->fd = open(TMP_IMAGE_FILE_NAME, - O_RDWR | O_DIRECT | O_LARGEFILE); - if (images[domid]->fd < 0) { - printf("Couldn't open image file! %d\n", errno); - return 0; - } - - printf("Image file opened. (%s)\n", images[domid]->imgname); - break; - } - } - return 0; -parse_error: - printf("Bad control message!\n"); - return 0; - -create_failed: - /* TODO: close the db ref. */ - return 0; -} - -int aio_request(blkif_request_t *req) -{ - int fd; - u64 sector; - char *spage, *dpage; - int ret, i, idx; - blkif_response_t *rsp; - domid_t dom = ID_TO_DOM(req->id); - - if ((images[dom] == NULL) || (images[dom]->fd == -1)) { - printf("Data request for unknown domain!!! %d\n", dom); - rsp = (blkif_response_t *)req; - rsp->id = req->id; - rsp->operation = req->operation; - rsp->status = BLKIF_RSP_ERROR; - return BLKTAP_RESPOND; - } - - fd = images[dom]->fd; - - switch (req->operation) - { - case BLKIF_OP_PROBE: - { - struct stat stat; - vdisk_t *img_info; - - - /* We expect one buffer only. */ - if ( req->nr_segments != 1 ) - goto err; - - /* Make sure the buffer is page-sized. */ - if ( (blkif_first_sect(req->frame_and_sects[0]) != 0) || - (blkif_last_sect (req->frame_and_sects[0]) != 7) ) - goto err; - - /* loop for multiple images would start here. */ - - ret = fstat(fd, &stat); - if (ret != 0) { - printf("Couldn't stat image in PROBE!\n"); - goto err; - } - - img_info = (vdisk_t *)MMAP_VADDR(ID_TO_IDX(req->id), 0); - img_info[0].device = images[dom]->vdevice; - img_info[0].info = VDISK_TYPE_DISK | VDISK_FLAG_VIRT; - img_info[0].capacity = (stat.st_size >> SECTOR_SHIFT); - - if (img_info[0].capacity == 0) - img_info[0].capacity = ((u64)1 << 63); // xend does this too. - - DPRINTF("iPROBE! device: 0x%04x capacity: %llu\n", img_info[0].device, - img_info[0].capacity); - - rsp = (blkif_response_t *)req; - rsp->id = req->id; - rsp->operation = BLKIF_OP_PROBE; - rsp->status = 1; /* number of disks */ - - return BLKTAP_RESPOND; - } - case BLKIF_OP_WRITE: - { - unsigned long size; - struct iocb *io; - struct iocb *ioq[MAX_SEGMENTS_PER_REQ]; - - idx = ID_TO_IDX(req->id); - ASSERT(pending_list[idx].count == 0); - memcpy(&pending_list[idx].req, req, sizeof(*req)); - pending_list[idx].count = req->nr_segments; - - for (i = 0; i < req->nr_segments; i++) { - - sector = req->sector_number + (8*i); - - size = blkif_last_sect (req->frame_and_sects[i]) - - blkif_first_sect(req->frame_and_sects[i]) + 1; - - DPRINTF("iWRITE: sec_nr: %10llu sec: %10llu (%1lu,%1lu) pos: %15lu\n", - req->sector_number, sector, - blkif_first_sect(req->frame_and_sects[i]), - blkif_last_sect (req->frame_and_sects[i]), - (long)(sector << SECTOR_SHIFT)); - - spage = (char *)MMAP_VADDR(ID_TO_IDX(req->id), i); - spage += blkif_first_sect(req->frame_and_sects[i]) << SECTOR_SHIFT; - - /*convert size and sector to byte offsets */ - size <<= SECTOR_SHIFT; - sector <<= SECTOR_SHIFT; - - io = iocb_free[--iocb_free_count]; - io_prep_pwrite(io, fd, spage, size, sector); - io->data = (void *)idx; - ioq[i] = io; - } - - ret = io_submit(ctx, req->nr_segments, ioq); - if (ret < 0) - printf("BADNESS: io_submit error! (%d)\n", errno); - - pending_list[idx].count = req->nr_segments; - - return BLKTAP_STOLEN; - - } - case BLKIF_OP_READ: - { - unsigned long size; - struct iocb *io; - struct iocb *ioq[MAX_SEGMENTS_PER_REQ]; - - idx = ID_TO_IDX(req->id); - ASSERT(pending_list[idx].count == 0); - memcpy(&pending_list[idx].req, req, sizeof(*req)); - pending_list[idx].count = req->nr_segments; - - for (i = 0; i < req->nr_segments; i++) { - - sector = req->sector_number + (8*i); - - size = blkif_last_sect (req->frame_and_sects[i]) - - blkif_first_sect(req->frame_and_sects[i]) + 1; - - dpage = (char *)MMAP_VADDR(ID_TO_IDX(req->id), i); - dpage += blkif_first_sect(req->frame_and_sects[i]) << SECTOR_SHIFT; - - - DPRINTF("iREAD : sec_nr: %10llu sec: %10llu (%1lu,%1lu) " - "pos: %15lu dpage: %p\n", - req->sector_number, sector, - blkif_first_sect(req->frame_and_sects[i]), - blkif_last_sect (req->frame_and_sects[i]), - (long)(sector << SECTOR_SHIFT), dpage); - - /*convert size and sector to byte offsets */ - size <<= SECTOR_SHIFT; - sector <<= SECTOR_SHIFT; - - io = iocb_free[--iocb_free_count]; - - io_prep_pread(io, fd, dpage, size, sector); - io->data = (void *)idx; - - ioq[i] = io; - } - - ret = io_submit(ctx, req->nr_segments, ioq); - if (ret < 0) - printf("BADNESS: io_submit error! (%d)\n", errno); - - - return BLKTAP_STOLEN; - - } - } - - printf("Unknown block operation!\n"); -err: - rsp = (blkif_response_t *)req; - rsp->id = req->id; - rsp->operation = req->operation; - rsp->status = BLKIF_RSP_ERROR; - return BLKTAP_RESPOND; -} - - -int aio_pollhook(int fd) -{ - struct io_event *ep; - int n, ret, idx; - blkif_request_t *req; - blkif_response_t *rsp; - - DPRINTF("aio_hook(): \n"); - - for (ep = aio_events; aio_event_count-- > 0; ep++) { - struct iocb *io = ep->obj; - idx = (int) ep->data; - - if ((idx > MAX_REQUESTS-1) || (pending_list[idx].count == 0)){ - printf("gnbd returned a bad cookie (%u)!\n", idx); - break; - } - - if ((int)ep->res < 0) printf("aio request error! (%d,%d)\n", - (int)ep->res, (int)ep->res2); - - pending_list[idx].count--; - iocb_free[iocb_free_count++] = io; - - if (pending_list[idx].count == 0) { - blkif_request_t tmp = pending_list[idx].req; - rsp = (blkif_response_t *)&pending_list[idx].req; - rsp->id = tmp.id; - rsp->operation = tmp.operation; - rsp->status = BLKIF_RSP_OKAY; - blktap_inject_response(rsp); - } - } - - printf("pollhook done!\n"); - - read(aio_notify[READ], &idx, sizeof(idx)); - aio_listening = 1; - - return 0; -} - -/* the image library terminates the request stream. _resp is a noop. */ -int aio_response(blkif_response_t *rsp) -{ - return BLKTAP_PASS; -} - -void aio_init(void) -{ - int i, rc; - pthread_t p; - - for (i = 0; i < MAX_DOMS; i++) - images[i] = NULL; - - for (i = 0; i < MAX_REQUESTS; i++) - pending_list[i].count = 0; - - memset(&ctx, 0, sizeof(ctx)); - rc = io_queue_init(MAX_AIO_REQS, &ctx); - if (rc != 0) { - printf("queue_init failed! (%d)\n", rc); - exit(0); - } - - for (i=0; i -#include -#include -#include -#include "blktaplib.h" - -#define MAX_DOMS 1024 -#define MAX_DBNAME_LEN 255 -#define AMORFS_DEV 61440 -#define MAX_REQUESTS 64 /* must be synced with the blkif drivers. */ - -#if 0 -#define DPRINTF(_f, _a...) printf ( _f , ## _a ) -#else -#define DPRINTF(_f, _a...) ((void)0) -#endif - -/* Berkeley db has different params for open() after 4.1 */ -#ifndef DB_VERSION_MAJOR -# define DB_VERSION_MAJOR 1 -#endif /* DB_VERSION_MAJOR */ -#ifndef DB_VERSION_MINOR -# define DB_VERSION_MINOR 0 -#endif /* DB_VERSION_MINOR */ - -typedef struct { - DB *db; - u64 fsid; - char dbname[MAX_DBNAME_LEN]; -} cow_t; - -cow_t *cows[MAX_DOMS]; -blkif_request_t *reread_list[MAX_REQUESTS]; - -int cow_control(control_msg_t *msg) -{ - domid_t domid; - DB *db; - int ret; - - if (msg->type != CMSG_BLKIF_BE) - { - printf("***\nUNEXPECTED CTRL MSG MAJOR TYPE(%d)\n***\n", msg->type); - return 0; - } - - switch(msg->subtype) - { - case CMSG_BLKIF_BE_CREATE: - if ( msg->length != sizeof(blkif_be_create_t) ) - goto parse_error; - printf("[CONTROL_MSG] CMSG_BLKIF_BE_CREATE(d:%d,h:%d)\n", - ((blkif_be_create_t *)msg->msg)->domid, - ((blkif_be_create_t *)msg->msg)->blkif_handle); - domid = ((blkif_be_create_t *)msg->msg)->domid; - if (cows[domid] != NULL) { - printf("attempt to connect from an existing dom!\n"); - return 0; - } - - cows[domid] = (cow_t *)malloc(sizeof(cow_t)); - if (cows[domid] == NULL) { - printf("error allocating cow.\n"); - return 0; - } - - cows[domid]->db = NULL; - cows[domid]->fsid = 0; - - printf("COW connected.\n"); - break; - - case CMSG_BLKIF_BE_DESTROY: - if ( msg->length != sizeof(blkif_be_destroy_t) ) - goto parse_error; - printf("[CONTROL_MSG] CMSG_BLKIF_BE_DESTROY(d:%d,h:%d)\n", - ((blkif_be_destroy_t *)msg->msg)->domid, - ((blkif_be_destroy_t *)msg->msg)->blkif_handle); - - domid = ((blkif_be_destroy_t *)msg->msg)->domid; - if (cows[domid] != NULL) { - if (cows[domid]->db != NULL) - cows[domid]->db->close(cows[domid]->db, 0); - free(cows[domid]); - cows[domid] = NULL; - } - break; - case CMSG_BLKIF_BE_VBD_GROW: - { - blkif_be_vbd_grow_t *grow; - - if ( msg->length != sizeof(blkif_be_vbd_grow_t) ) - goto parse_error; - printf("[CONTROL_MSG] CMSG_BLKIF_BE_VBD_GROW(d:%d,h:%d,v:%d)\n", - ((blkif_be_vbd_grow_t *)msg->msg)->domid, - ((blkif_be_vbd_grow_t *)msg->msg)->blkif_handle, - ((blkif_be_vbd_grow_t *)msg->msg)->vdevice); - printf(" Extent: sec_start: %llu sec_len: %llu, dev: %d\n", - ((blkif_be_vbd_grow_t *)msg->msg)->extent.sector_start, - ((blkif_be_vbd_grow_t *)msg->msg)->extent.sector_length, - ((blkif_be_vbd_grow_t *)msg->msg)->extent.device); - grow = (blkif_be_vbd_grow_t *)msg->msg; - domid = grow->domid; - if (cows[domid] == NULL) { - printf("VBD_GROW on unconnected domain!\n"); - return 0; - } - - if (grow->extent.device != AMORFS_DEV) { - printf("VBD_GROW on non-amorfs device!\n"); - return 0; - } - - sprintf(&cows[domid]->dbname[0], "%020llu.db", - grow->extent.sector_start); - - cows[domid]->fsid = grow->extent.sector_start; - - if ((ret = db_create(&db, NULL, 0)) != 0) { - fprintf(stderr, "db_create: %s\n", db_strerror(ret)); - return 0; - } - - -#if DB_VERSION_MAJOR < 4 || (DB_VERSION_MAJOR == 4 && DB_VERSION_MINOR < 1) - - if ((ret = db->open( db, cows[domid]->dbname, NULL, DB_BTREE, - DB_CREATE, 0664)) != 0) { - -#else /* DB_VERSION >= 4.1 */ - - if ((ret = db->open( db, NULL, cows[domid]->dbname, NULL, DB_BTREE, - DB_CREATE, 0664)) != 0) { - -#endif /* DB_VERSION < 4.1 */ - - db->err(db, ret, "%s", cows[domid]->dbname); - goto create_failed; - } - cows[domid]->db = db; - printf("Overlay db opened. (%s)\n", cows[domid]->dbname); - break; - } - } - return 0; -parse_error: - printf("Bad control message!\n"); - return 0; - -create_failed: - /* TODO: close the db ref. */ - return 0; -} - -int cow_request(blkif_request_t *req) -{ - DB *db; - DBT key, data; - u64 sector; - char *spage, *dpage; - int ret, i, idx; - blkif_response_t *rsp; - domid_t dom = ID_TO_DOM(req->id); - - if ((cows[dom] == NULL) || (cows[dom]->db == NULL)) { - printf("Data request for unknown domain!!! %d\n", dom); - rsp = (blkif_response_t *)req; - rsp->id = req->id; - rsp->operation = req->operation; - rsp->status = BLKIF_RSP_ERROR; - return BLKTAP_RESPOND; - } - - db = cows[dom]->db; - - switch (req->operation) - { - case BLKIF_OP_PROBE: -/* debug -- delete */ -idx = ID_TO_IDX(req->id); -reread_list[idx] = (blkif_request_t *)malloc(sizeof(*req)); -memcpy(reread_list[idx], req, sizeof(*req)); - return BLKTAP_PASS; - - case BLKIF_OP_WRITE: - for (i = 0; i < req->nr_segments; i++) { - memset(&key, 0, sizeof(key)); - memset(&data, 0, sizeof(data)); - - sector = req->sector_number + (8*i); - key.data = §or; - key.size = sizeof(sector); - - spage = (char *)MMAP_VADDR(ID_TO_IDX(req->id), i); - data.data = spage; - data.size = PAGE_SIZE; - - - DPRINTF("cWRITE: sec_nr: %10llu sec: %10llu (%1lu,%1lu) pos: %15lu\n", - req->sector_number, sector, - blkif_first_sect(req->frame_and_sects[i]), - blkif_last_sect (req->frame_and_sects[i]), - (long)(sector << 9)); - - if ((ret = db->put(db, NULL, &key, &data, 0)) == 0) - DPRINTF("db: %lld: key stored.\n", *((u64 *)key.data)); - else { - db->err(db, ret, "DB->put"); - goto err; - } - } - - rsp = (blkif_response_t *)req; - rsp->id = req->id; - rsp->operation = BLKIF_OP_WRITE; - rsp->status = BLKIF_RSP_OKAY; - - return BLKTAP_RESPOND; - - case BLKIF_OP_READ: - for (i = 0; i < req->nr_segments; i++) { - memset(&key, 0, sizeof(key)); - memset(&data, 0, sizeof(data)); - - sector = req->sector_number + (8*i); - key.data = §or; - key.size = sizeof(sector); - - DPRINTF("cREAD: sec_nr: %10llu sec: %10llu (%1lu,%1lu) pos: %15lu\n", - req->sector_number, sector, - blkif_first_sect(req->frame_and_sects[i]), - blkif_last_sect (req->frame_and_sects[i]), - (long)(sector << 9)); - - if ((ret = db->get(db, NULL, &key, &data, 0)) == 0) { - DPRINTF("db: %llu: key retrieved (req).\n", - *((u64 *)key.data)); - - dpage = (char *)MMAP_VADDR(ID_TO_IDX(req->id), i); - spage = data.data; - memcpy(dpage, spage, PAGE_SIZE); - - } else if (ret == DB_NOTFOUND) { - idx = ID_TO_IDX(req->id); - if (idx > MAX_REQUESTS) { - printf("Bad index!\n"); - goto err; - } - if (reread_list[idx] != NULL) { - printf("Dupe index!\n"); - goto err; - } - reread_list[idx] = (blkif_request_t *)malloc(sizeof(*req)); - memcpy(reread_list[idx], req, sizeof(*req)); - return BLKTAP_PASS; - } else { - db->err(db, ret, "DB->get"); - goto err; - } - } - - - rsp = (blkif_response_t *)req; - rsp->id = req->id; - rsp->operation = BLKIF_OP_READ; - rsp->status = BLKIF_RSP_OKAY; - return BLKTAP_RESPOND; - } - - printf("Unknow block operation!\n"); - return BLKTAP_PASS; -err: - rsp = (blkif_response_t *)req; - rsp->id = req->id; - rsp->operation = req->operation; - rsp->status = BLKIF_RSP_ERROR; - return BLKTAP_RESPOND; -} - -int cow_response(blkif_response_t *rsp) -{ - blkif_request_t *req; - int i, ret; - DB *db; - DBT key, data; - u64 sector; - char *spage, *dpage; - int idx = ID_TO_IDX(rsp->id); - domid_t dom; - - /* don't touch erroring responses. */ - if (rsp->status == BLKIF_RSP_ERROR) - return BLKTAP_PASS; - - if ((rsp->operation == BLKIF_OP_READ) && (reread_list[idx] != NULL)) - { - req = reread_list[idx]; - dom = ID_TO_DOM(req->id); - - if ((cows[dom] == NULL) || (cows[dom]->db == NULL)) { - printf("Response from unknown domain!!! Very badness! %d\n", dom); - return BLKTAP_PASS; - } - - db = cows[dom]->db; - - for (i = 0; i < req->nr_segments; i++) { - memset(&key, 0, sizeof(key)); - memset(&data, 0, sizeof(data)); - - sector = req->sector_number + (8*i); - key.data = §or; - key.size = sizeof(sector); - - if ((ret = db->get(db, NULL, &key, &data, 0)) == 0) { - printf("db: %llu: key retrieved (rsp).\n", - *((u64 *)key.data)); - - dpage = (char *)MMAP_VADDR(ID_TO_IDX(req->id), i); - spage = data.data; - memcpy(dpage, spage, PAGE_SIZE); - - } else if (ret == DB_NOTFOUND) { - continue; /* We read this from disk. */ - } else { - db->err(db, ret, "DB->get"); - goto err; - } - } - free(reread_list[idx]); - reread_list[idx] = NULL; - } - - if (rsp->operation == BLKIF_OP_PROBE) { - - vdisk_t *img_info; - - req = reread_list[idx]; - img_info = (vdisk_t *)(char *)MMAP_VADDR(ID_TO_IDX(req->id), 0); - for (i =0; i < rsp->status; i++) - printf("PROBE (%d) device: 0x%04x capacity: %llu, info: 0x%04x\n", - i, - img_info[0].device, - img_info[0].capacity, - img_info[0].info); - free(reread_list[idx]); - reread_list[idx] = NULL; - } - -err: - return BLKTAP_PASS; -} - -void cow_init(void) -{ - int i; - - for (i = 0; i < MAX_DOMS; i++) - cows[i] = NULL; - - for (i = 0; i < MAX_REQUESTS; i++) - reread_list[MAX_REQUESTS] = NULL; -} - diff --git a/tools/blktap/blkcowlib.h b/tools/blktap/blkcowlib.h deleted file mode 100644 index e6bd7a5898..0000000000 --- a/tools/blktap/blkcowlib.h +++ /dev/null @@ -1,14 +0,0 @@ -/* blkcowlib.h - * - * copy on write a block device. in a really inefficient way. - * - * (c) 2004 Andrew Warfield. - * - * public interfaces to the CoW tap. - * - */ - -int cow_control (control_msg_t *msg); -int cow_request (blkif_request_t *req); -int cow_response (blkif_response_t *rsp); -void cow_init (void); diff --git a/tools/blktap/blkdump.c b/tools/blktap/blkdump.c index d64a58981a..0cf087ff02 100644 --- a/tools/blktap/blkdump.c +++ b/tools/blktap/blkdump.c @@ -62,18 +62,6 @@ int control_print(control_msg_t *msg) ((blkif_be_vbd_destroy_t *)msg->msg)->blkif_handle, ((blkif_be_vbd_destroy_t *)msg->msg)->vdevice); break; - case CMSG_BLKIF_BE_VBD_GROW: - if ( msg->length != sizeof(blkif_be_vbd_grow_t) ) - goto parse_error; - printf("[CONTROL_MSG] CMSG_BLKIF_BE_VBD_GROW(d:%d,h:%d,v:%d)\n", - ((blkif_be_vbd_grow_t *)msg->msg)->domid, - ((blkif_be_vbd_grow_t *)msg->msg)->blkif_handle, - ((blkif_be_vbd_grow_t *)msg->msg)->vdevice); - printf(" Extent: sec_start: %llu sec_len: %llu, dev: %d\n", - ((blkif_be_vbd_grow_t *)msg->msg)->extent.sector_start, - ((blkif_be_vbd_grow_t *)msg->msg)->extent.sector_length, - ((blkif_be_vbd_grow_t *)msg->msg)->extent.device); - break; default: goto parse_error; } diff --git a/tools/blktap/blkgnbd.c b/tools/blktap/blkgnbd.c deleted file mode 100644 index 6a6bd67285..0000000000 --- a/tools/blktap/blkgnbd.c +++ /dev/null @@ -1,19 +0,0 @@ -/* blkgnbd.c - * - * gnbd-backed disk. - */ - -#include "blktaplib.h" -#include "blkgnbdlib.h" - - -int main(int argc, char *argv[]) -{ - gnbd_init(); - - blktap_register_ctrl_hook("gnbd_control", gnbd_control); - blktap_register_request_hook("gnbd_request", gnbd_request); - blktap_listen(); - - return 0; -} diff --git a/tools/blktap/blkgnbdlib.c b/tools/blktap/blkgnbdlib.c deleted file mode 100644 index 6eeb49c853..0000000000 --- a/tools/blktap/blkgnbdlib.c +++ /dev/null @@ -1,471 +0,0 @@ -/* blkgnbdlib.c - * - * gnbd image-backed block device. - * - * (c) 2004 Andrew Warfield. - * - * Xend has been modified to use an amorfs:[fsid] disk tag. - * This will show up as device type (maj:240,min:0) = 61440. - * - * The fsid is placed in the sec_start field of the disk extent. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "blktaplib.h" -#include "libgnbd/libgnbd.h" - -#define GNBD_SERVER "skirmish.cl.cam.ac.uk" -#define GNBD_CLIENT "pengi-0.xeno.cl.cam.ac.uk" -#define GNBD_MOUNT "fc2_akw27" -#define GNBD_PORT 0x38e7 - -#define MAX_DOMS 1024 -#define MAX_IMGNAME_LEN 255 -#define AMORFS_DEV 61440 -#define MAX_REQUESTS 64 /* must be synced with the blkif drivers. */ -#define SECTOR_SHIFT 9 - -#if 0 -#define DPRINTF(_f, _a...) printf ( _f , ## _a ) -#else -#define DPRINTF(_f, _a...) ((void)0) -#endif - -#if 1 -#define ASSERT(_p) \ - if ( !(_p) ) { printf("Assertion '%s' failed, line %d, file %s", #_p , \ - __LINE__, __FILE__); *(int*)0=0; } -#else -#define ASSERT(_p) ((void)0) -#endif - -#define GH_DISCONNECTED 0 -#define GH_PROBEWAITING 1 -#define GH_CONNECTED 2 - -typedef struct { - /* These need to turn into an array/rbtree for multi-disk support. */ - struct gnbd_handle *gh; - int gh_state; - int probe_idx; /* This really needs cleaning up after hotos. */ - int fd; - u64 fsid; - char gnbdname[MAX_IMGNAME_LEN]; - blkif_vdev_t vdevice; -} gnbd_t; - -/* Note on pending_reqs: I assume all reqs are queued before they start to - * get filled. so count of 0 is an unused record. - */ -typedef struct { - blkif_request_t req; - int count; -} pending_req_t; - -static gnbd_t *gnbds[MAX_DOMS]; -static pending_req_t pending_list[MAX_REQUESTS]; -static int pending_count = 0; /* debugging */ - - -gnbd_t *get_gnbd_by_fd(int fd) -{ - /* this is a linear scan for the moment. nees to be cleaned up for - multi-disk support. */ - - int i; - - for (i=0; i< MAX_DOMS; i++) - if ((gnbds[i] != NULL) && (gnbds[i]->fd == fd)) - return gnbds[i]; - - return NULL; -} - -int gnbd_pollhook(int fd); - -int gnbd_control(control_msg_t *msg) -{ - domid_t domid; - DB *db; - int ret; - - if (msg->type != CMSG_BLKIF_BE) - { - printf("***\nUNEXPECTED CTRL MSG MAJOR TYPE(%d)\n***\n", msg->type); - return 0; - } - - switch(msg->subtype) - { - case CMSG_BLKIF_BE_CREATE: - if ( msg->length != sizeof(blkif_be_create_t) ) - goto parse_error; - printf("[CONTROL_MSG] CMSG_BLKIF_BE_CREATE(d:%d,h:%d)\n", - ((blkif_be_create_t *)msg->msg)->domid, - ((blkif_be_create_t *)msg->msg)->blkif_handle); - domid = ((blkif_be_create_t *)msg->msg)->domid; - if (gnbds[domid] != NULL) { - printf("attempt to connect from an existing dom!\n"); - return 0; - } - - gnbds[domid] = (gnbd_t *)malloc(sizeof(gnbd_t)); - if (gnbds[domid] == NULL) { - printf("error allocating gnbd record.\n"); - return 0; - } - - gnbds[domid]->gh = NULL; - gnbds[domid]->fsid = 0; - - break; - - case CMSG_BLKIF_BE_DESTROY: - if ( msg->length != sizeof(blkif_be_destroy_t) ) - goto parse_error; - printf("[CONTROL_MSG] CMSG_BLKIF_BE_DESTROY(d:%d,h:%d)\n", - ((blkif_be_destroy_t *)msg->msg)->domid, - ((blkif_be_destroy_t *)msg->msg)->blkif_handle); - - domid = ((blkif_be_destroy_t *)msg->msg)->domid; - if (gnbds[domid] != NULL) { - if (gnbds[domid]->gh != NULL) { - blktap_detach_poll(gnbds[domid]->fd); - free(gnbds[domid]->gh); /* XXX: Need a gnbd close call! */; - } - free( gnbds[domid] ); - gnbds[domid] = NULL; - } - break; - case CMSG_BLKIF_BE_VBD_GROW: - { - blkif_be_vbd_grow_t *grow; - - if ( msg->length != sizeof(blkif_be_vbd_grow_t) ) - goto parse_error; - printf("[CONTROL_MSG] CMSG_BLKIF_BE_VBD_GROW(d:%d,h:%d,v:%d)\n", - ((blkif_be_vbd_grow_t *)msg->msg)->domid, - ((blkif_be_vbd_grow_t *)msg->msg)->blkif_handle, - ((blkif_be_vbd_grow_t *)msg->msg)->vdevice); - printf(" Extent: sec_start: %llu sec_len: %llu, dev: %d\n", - ((blkif_be_vbd_grow_t *)msg->msg)->extent.sector_start, - ((blkif_be_vbd_grow_t *)msg->msg)->extent.sector_length, - ((blkif_be_vbd_grow_t *)msg->msg)->extent.device); - grow = (blkif_be_vbd_grow_t *)msg->msg; - domid = grow->domid; - if (gnbds[domid] == NULL) { - printf("VBD_GROW on unconnected domain!\n"); - return 0; - } - - if (grow->extent.device != AMORFS_DEV) { - printf("VBD_GROW on non-amorfs device!\n"); - return 0; - } - - /* TODO: config support for arbitrary gnbd files/modes. */ - sprintf(gnbds[domid]->gnbdname, GNBD_MOUNT); - - gnbds[domid]->fsid = grow->extent.sector_start; - gnbds[domid]->vdevice = grow->vdevice; - gnbds[domid]->gh_state = GH_DISCONNECTED; - gnbds[domid]->gh = gnbd_setup(GNBD_SERVER, GNBD_PORT, - gnbds[domid]->gnbdname, GNBD_CLIENT); - if (gnbds[domid]->gh == NULL) { - printf("Couldn't connect to gnbd mount!!\n"); - return 0; - } - gnbds[domid]->fd = gnbd_fd(gnbds[domid]->gh); - blktap_attach_poll(gnbds[domid]->fd, POLLIN, gnbd_pollhook); - - printf("gnbd mount connected. (%s)\n", gnbds[domid]->gnbdname); - break; - } - } - return 0; -parse_error: - printf("Bad control message!\n"); - return 0; - -create_failed: - /* TODO: close the db ref. */ - return 0; -} - -static int gnbd_blkif_probe(blkif_request_t *req, gnbd_t *gnbd) -{ - int fd; - struct stat stat; - vdisk_t *gnbd_info; - blkif_response_t *rsp; - - /* We expect one buffer only. */ - if ( req->nr_segments != 1 ) - goto err; - - /* Make sure the buffer is page-sized. */ - if ( (blkif_first_sect(req->frame_and_sects[0]) != 0) || - (blkif_last_sect (req->frame_and_sects[0]) != 7) ) - goto err; - - /* loop for multiple gnbds would start here. */ - - gnbd_info = (vdisk_t *)MMAP_VADDR(ID_TO_IDX(req->id), 0); - gnbd_info[0].device = gnbd->vdevice; - gnbd_info[0].info = VDISK_TYPE_DISK | VDISK_FLAG_VIRT; - gnbd_info[0].capacity = gnbd_sectors(gnbd->gh); - - printf("[SECTORS] %llu", gnbd_info[0].capacity); - - //if (gnbd_info[0].capacity == 0) - // gnbd_info[0].capacity = ((u64)1 << 63); // xend does this too. - - DPRINTF("iPROBE! device: 0x%04x capacity: %llu\n", gnbd_info[0].device, - gnbd_info[0].capacity); - - rsp = (blkif_response_t *)req; - rsp->id = req->id; - rsp->operation = BLKIF_OP_PROBE; - rsp->status = 1; /* number of disks */ - - return BLKTAP_RESPOND; -err: - rsp = (blkif_response_t *)req; - rsp->id = req->id; - rsp->operation = req->operation; - rsp->status = BLKIF_RSP_ERROR; - return BLKTAP_RESPOND; -} - -int gnbd_request(blkif_request_t *req) -{ - struct gnbd_handle *gh; - u64 sector; - char *spage, *dpage; - int ret, i, idx; - blkif_response_t *rsp; - domid_t dom = ID_TO_DOM(req->id); - - if ((gnbds[dom] == NULL) || (gnbds[dom]->gh == NULL)) { - printf("Data request for unknown domain!!! %d\n", dom); - rsp = (blkif_response_t *)req; - rsp->id = req->id; - rsp->operation = req->operation; - rsp->status = BLKIF_RSP_ERROR; - return BLKTAP_RESPOND; - } - - gh = gnbds[dom]->gh; - - switch (req->operation) - { - case BLKIF_OP_PROBE: - { - printf("PROBE!\n"); - if ( gnbds[dom]->gh_state == GH_PROBEWAITING ) { - printf("Already have a PROBE outstanding!\n"); - goto err; - } - - if ( gnbds[dom]->gh_state == GH_DISCONNECTED ) - { - /* need to defer until we are connected. */ - printf("Deferring PROBE!\n"); - idx = ID_TO_IDX(req->id); - memcpy(&pending_list[idx].req, req, sizeof(*req)); - ASSERT(pending_list[idx].count == 0); - pending_list[idx].count = 1; - - gnbds[dom]->probe_idx = idx; - gnbds[dom]->gh_state = GH_PROBEWAITING; - - return BLKTAP_STOLEN; - } - - - return gnbd_blkif_probe(req, gnbds[dom]); - } - case BLKIF_OP_WRITE: - { - unsigned long size; - - idx = ID_TO_IDX(req->id); - ASSERT(pending_list[idx].count == 0); - memcpy(&pending_list[idx].req, req, sizeof(*req)); - pending_list[idx].count = req->nr_segments; - pending_count++; /* dbg */ - - for (i = 0; i < req->nr_segments; i++) { - - sector = req->sector_number + (8*i); - - size = blkif_last_sect (req->frame_and_sects[i]) - - blkif_first_sect(req->frame_and_sects[i]) + 1; - - DPRINTF("iWRITE: sec_nr: %10llu sec: %10llu (%1lu,%1lu) pos: %15lu\n", - req->sector_number, sector, - blkif_first_sect(req->frame_and_sects[i]), - blkif_last_sect (req->frame_and_sects[i]), - (long)(sector << SECTOR_SHIFT)); - - spage = (char *)MMAP_VADDR(ID_TO_IDX(req->id), i); - spage += blkif_first_sect(req->frame_and_sects[i]) << SECTOR_SHIFT; - - ret = gnbd_write(gh, sector, size, spage, (unsigned long)idx); - if (ret) { - printf("gnbd error on WRITE\n"); - goto err; - } - } -//printf("[WR] < %lu\n", (unsigned long)idx); - - return BLKTAP_STOLEN; - } - case BLKIF_OP_READ: - { - unsigned long size; - - idx = ID_TO_IDX(req->id); - ASSERT(pending_list[idx].count == 0); - memcpy(&pending_list[idx].req, req, sizeof(*req)); - pending_list[idx].count = req->nr_segments; - pending_count++; /* dbg */ - - for (i = 0; i < req->nr_segments; i++) { - - sector = req->sector_number + (8*i); - - size = blkif_last_sect (req->frame_and_sects[i]) - - blkif_first_sect(req->frame_and_sects[i]) + 1; - - DPRINTF("iREAD : sec_nr: %10llu sec: %10llu (%1lu,%1lu) pos: %15lu\n", - req->sector_number, sector, - blkif_first_sect(req->frame_and_sects[i]), - blkif_last_sect (req->frame_and_sects[i]), - (long)(sector << SECTOR_SHIFT)); - - dpage = (char *)MMAP_VADDR(ID_TO_IDX(req->id), i); - dpage += blkif_first_sect(req->frame_and_sects[i]) << SECTOR_SHIFT; - - ret = gnbd_read(gh, sector, size, dpage, (unsigned long)idx); - if (ret) { - printf("gnbd error on READ\n"); - goto err; - } - - } -//printf("[RD] < %lu\n", (unsigned long)idx); - - return BLKTAP_STOLEN; - } - } - - printf("Unknown block operation!\n"); -err: - rsp = (blkif_response_t *)req; - rsp->id = req->id; - rsp->operation = req->operation; - rsp->status = BLKIF_RSP_ERROR; - return BLKTAP_RESPOND; -} - -/* the gnbd library terminates the request stream. _resp is a noop. */ -int gnbd_response(blkif_response_t *rsp) -{ - return BLKTAP_PASS; -} - -int gnbd_pollhook(int fd) -{ - int err; - struct gnbd_handle *gh; - blkif_request_t *req; - blkif_response_t *rsp; - unsigned long idx; - - gnbd_t *gnbd = get_gnbd_by_fd(fd); - - if (gnbd == NULL) { - printf("GNBD badness: got poll hook on unknown device. (%d)\n", fd); - return -1; - } - gh = gnbd->gh; - err = gnbd_reply(gh); - switch (err) { - case GNBD_LOGIN_DONE: - if (gnbd->gh_state == GH_PROBEWAITING) { - req = (blkif_request_t *)&pending_list[gnbd->probe_idx].req; - printf("[!] Sending deferred PROBE!\n"); - gnbd_blkif_probe(req, gnbd); - pending_list[gnbd->probe_idx].count = 0; - rsp = (blkif_response_t *)req; - blktap_inject_response(rsp); - } - gnbd->gh_state = GH_CONNECTED; - printf("GNBD_LOGIN_DONE (%d)\n", fd); - break; - - case GNBD_REQUEST_DONE: /* switch to idx */ - idx = gnbd_finished_request(gh); - req = (blkif_request_t *)&pending_list[idx].req; - if ((idx > MAX_REQUESTS-1) || (pending_list[idx].count == 0)){ - printf("gnbd returned a bad cookie (%lu)!\n", idx); - break; - } - - pending_list[idx].count--; - - if (pending_list[idx].count == 0) { - blkif_request_t tmp = *req; - pending_count--; /* dbg */ - rsp = (blkif_response_t *)req; - rsp->id = tmp.id; - rsp->operation = tmp.operation; - rsp->status = BLKIF_RSP_OKAY; - blktap_inject_response(rsp); -/* -if (rsp->operation == BLKIF_OP_READ) { -printf("[RD] > %lu (%d pndg)\n", (unsigned long)idx, pending_count); -} else if (rsp->operation == BLKIF_OP_WRITE) { -printf("[WR] > %lu (%d pndg)\n", (unsigned long)idx, pending_count); -} else { -printf("[??] > %lu (%d pndg)\n", (unsigned long)idx, pending_count); -} -*/ - } - break; - - case GNBD_CONTINUE: - break; - - case 0: - break; - - default: - printf("gnbd_reply error"); - break; - } - return 0; -} - -void gnbd_init(void) -{ - int i; - - for (i = 0; i < MAX_DOMS; i++) - gnbds[i] = NULL; - - for (i = 0; i < MAX_REQUESTS; i++) - pending_list[i].count = 0; - - printf("GNBD image plugin initialized\n"); -} - diff --git a/tools/blktap/blkgnbdlib.h b/tools/blktap/blkgnbdlib.h deleted file mode 100644 index b95d2409ac..0000000000 --- a/tools/blktap/blkgnbdlib.h +++ /dev/null @@ -1,16 +0,0 @@ -/* blkgnbdlib.h - * - * gndb image-backed block device. - * - * (c) 2004 Andrew Warfield. - * - * Xend has been modified to use an amorfs:[fsid] disk tag. - * This will show up as device type (maj:240,min:0) = 61440. - * - * The fsid is placed in the sec_start field of the disk extent. - */ - -int gnbd_control(control_msg_t *msg); -int gnbd_request(blkif_request_t *req); -int gnbd_response(blkif_response_t *rsp); /* noop */ -void gnbd_init(void); diff --git a/tools/blktap/blkimg.c b/tools/blktap/blkimg.c deleted file mode 100644 index fc746add4b..0000000000 --- a/tools/blktap/blkimg.c +++ /dev/null @@ -1,19 +0,0 @@ -/* blkimg.c - * - * file-backed disk. - */ - -#include "blktaplib.h" -#include "blkimglib.h" - - -int main(int argc, char *argv[]) -{ - image_init(); - - blktap_register_ctrl_hook("image_control", image_control); - blktap_register_request_hook("image_request", image_request); - blktap_listen(); - - return 0; -} diff --git a/tools/blktap/blkimglib.c b/tools/blktap/blkimglib.c deleted file mode 100644 index 075a2d962d..0000000000 --- a/tools/blktap/blkimglib.c +++ /dev/null @@ -1,325 +0,0 @@ -/* blkimglib.c - * - * file image-backed block device. - * - * (c) 2004 Andrew Warfield. - * - * Xend has been modified to use an amorfs:[fsid] disk tag. - * This will show up as device type (maj:240,min:0) = 61440. - * - * The fsid is placed in the sec_start field of the disk extent. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include "blktaplib.h" - -//#define TMP_IMAGE_FILE_NAME "/dev/sda1" -#define TMP_IMAGE_FILE_NAME "fc3.image" - -#define MAX_DOMS 1024 -#define MAX_IMGNAME_LEN 255 -#define AMORFS_DEV 61440 -#define MAX_REQUESTS 64 /* must be synced with the blkif drivers. */ -#define SECTOR_SHIFT 9 - -#if 0 -#define DPRINTF(_f, _a...) printf ( _f , ## _a ) -#else -#define DPRINTF(_f, _a...) ((void)0) -#endif - - -typedef struct { - /* These need to turn into an array/rbtree for multi-disk support. */ - FILE *img; - u64 fsid; - char imgname[MAX_IMGNAME_LEN]; - blkif_vdev_t vdevice; -} image_t; - -image_t *images[MAX_DOMS]; -blkif_request_t *reread_list[MAX_REQUESTS]; - -int image_control(control_msg_t *msg) -{ - domid_t domid; - DB *db; - int ret; - - if (msg->type != CMSG_BLKIF_BE) - { - printf("***\nUNEXPECTED CTRL MSG MAJOR TYPE(%d)\n***\n", msg->type); - return 0; - } - - switch(msg->subtype) - { - case CMSG_BLKIF_BE_CREATE: - if ( msg->length != sizeof(blkif_be_create_t) ) - goto parse_error; - printf("[CONTROL_MSG] CMSG_BLKIF_BE_CREATE(d:%d,h:%d)\n", - ((blkif_be_create_t *)msg->msg)->domid, - ((blkif_be_create_t *)msg->msg)->blkif_handle); - domid = ((blkif_be_create_t *)msg->msg)->domid; - if (images[domid] != NULL) { - printf("attempt to connect from an existing dom!\n"); - return 0; - } - - images[domid] = (image_t *)malloc(sizeof(image_t)); - if (images[domid] == NULL) { - printf("error allocating image record.\n"); - return 0; - } - - images[domid]->img = NULL; - images[domid]->fsid = 0; - - printf("Image connected.\n"); - break; - - case CMSG_BLKIF_BE_DESTROY: - if ( msg->length != sizeof(blkif_be_destroy_t) ) - goto parse_error; - printf("[CONTROL_MSG] CMSG_BLKIF_BE_DESTROY(d:%d,h:%d)\n", - ((blkif_be_destroy_t *)msg->msg)->domid, - ((blkif_be_destroy_t *)msg->msg)->blkif_handle); - - domid = ((blkif_be_destroy_t *)msg->msg)->domid; - if (images[domid] != NULL) { - if (images[domid]->img != NULL) - fclose( images[domid]->img ); - free( images[domid] ); - images[domid] = NULL; - } - break; - case CMSG_BLKIF_BE_VBD_GROW: - { - blkif_be_vbd_grow_t *grow; - - if ( msg->length != sizeof(blkif_be_vbd_grow_t) ) - goto parse_error; - printf("[CONTROL_MSG] CMSG_BLKIF_BE_VBD_GROW(d:%d,h:%d,v:%d)\n", - ((blkif_be_vbd_grow_t *)msg->msg)->domid, - ((blkif_be_vbd_grow_t *)msg->msg)->blkif_handle, - ((blkif_be_vbd_grow_t *)msg->msg)->vdevice); - printf(" Extent: sec_start: %llu sec_len: %llu, dev: %d\n", - ((blkif_be_vbd_grow_t *)msg->msg)->extent.sector_start, - ((blkif_be_vbd_grow_t *)msg->msg)->extent.sector_length, - ((blkif_be_vbd_grow_t *)msg->msg)->extent.device); - grow = (blkif_be_vbd_grow_t *)msg->msg; - domid = grow->domid; - if (images[domid] == NULL) { - printf("VBD_GROW on unconnected domain!\n"); - return 0; - } - - if (grow->extent.device != AMORFS_DEV) { - printf("VBD_GROW on non-amorfs device!\n"); - return 0; - } - - /* TODO: config support for arbitrary image files/modes. */ - sprintf(images[domid]->imgname, TMP_IMAGE_FILE_NAME); - - images[domid]->fsid = grow->extent.sector_start; - images[domid]->vdevice = grow->vdevice; - images[domid]->img = fopen64(TMP_IMAGE_FILE_NAME, "r+"); - if (images[domid]->img == NULL) { - printf("Couldn't open image file!\n"); - return 0; - } - - printf("Image file opened. (%s)\n", images[domid]->imgname); - break; - } - } - return 0; -parse_error: - printf("Bad control message!\n"); - return 0; - -create_failed: - /* TODO: close the db ref. */ - return 0; -} - -int image_request(blkif_request_t *req) -{ - FILE *img; - u64 sector; - char *spage, *dpage; - int ret, i, idx; - blkif_response_t *rsp; - domid_t dom = ID_TO_DOM(req->id); - - if ((images[dom] == NULL) || (images[dom]->img == NULL)) { - printf("Data request for unknown domain!!! %d\n", dom); - rsp = (blkif_response_t *)req; - rsp->id = req->id; - rsp->operation = req->operation; - rsp->status = BLKIF_RSP_ERROR; - return BLKTAP_RESPOND; - } - - img = images[dom]->img; - - switch (req->operation) - { - case BLKIF_OP_PROBE: - { - int fd; - struct stat stat; - vdisk_t *img_info; - - - /* We expect one buffer only. */ - if ( req->nr_segments != 1 ) - goto err; - - /* Make sure the buffer is page-sized. */ - if ( (blkif_first_sect(req->frame_and_sects[0]) != 0) || - (blkif_last_sect (req->frame_and_sects[0]) != 7) ) - goto err; - - /* loop for multiple images would start here. */ - - fd = fileno(img); - if (fd == -1) { - printf("Couldn't get image fd in PROBE!\n"); - goto err; - } - - ret = fstat(fd, &stat); - if (ret != 0) { - printf("Couldn't stat image in PROBE!\n"); - goto err; - } - - img_info = (vdisk_t *)MMAP_VADDR(ID_TO_IDX(req->id), 0); - img_info[0].device = images[dom]->vdevice; - img_info[0].info = VDISK_TYPE_DISK | VDISK_FLAG_VIRT; - img_info[0].capacity = (stat.st_size >> SECTOR_SHIFT); - - if (img_info[0].capacity == 0) - img_info[0].capacity = ((u64)1 << 63); // xend does this too. - - DPRINTF("iPROBE! device: 0x%04x capacity: %llu\n", img_info[0].device, - img_info[0].capacity); - - rsp = (blkif_response_t *)req; - rsp->id = req->id; - rsp->operation = BLKIF_OP_PROBE; - rsp->status = 1; /* number of disks */ - - return BLKTAP_RESPOND; - } - case BLKIF_OP_WRITE: - { - unsigned long size; - - for (i = 0; i < req->nr_segments; i++) { - - sector = req->sector_number + (8*i); - - size = blkif_last_sect (req->frame_and_sects[i]) - - blkif_first_sect(req->frame_and_sects[i]) + 1; - - ret = fseeko64(img, (off_t)(sector << SECTOR_SHIFT), SEEK_SET); - if (ret != 0) { - printf("fseek error on WRITE\n"); - goto err; - } - - DPRINTF("iWRITE: sec_nr: %10llu sec: %10llu (%1lu,%1lu) pos: %15lu\n", - req->sector_number, sector, - blkif_first_sect(req->frame_and_sects[i]), - blkif_last_sect (req->frame_and_sects[i]), - (long)(sector << SECTOR_SHIFT)); - - spage = (char *)MMAP_VADDR(ID_TO_IDX(req->id), i); - spage += blkif_first_sect(req->frame_and_sects[i]) << SECTOR_SHIFT; - ret = fwrite(spage, size << SECTOR_SHIFT, 1, img); - if (ret != 1) { - printf("fwrite error on WRITE (%d)\n", errno); - goto err; - } - } - - rsp = (blkif_response_t *)req; - rsp->id = req->id; - rsp->operation = BLKIF_OP_WRITE; - rsp->status = BLKIF_RSP_OKAY; - - return BLKTAP_RESPOND; - } - case BLKIF_OP_READ: - { - unsigned long size; - - for (i = 0; i < req->nr_segments; i++) { - - sector = req->sector_number + (8*i); - - size = blkif_last_sect (req->frame_and_sects[i]) - - blkif_first_sect(req->frame_and_sects[i]) + 1; - - ret = fseeko64(img, (off_t)(sector << SECTOR_SHIFT), SEEK_SET); - if (ret != 0) { - printf("fseek error on READ\n"); - goto err; - } - - DPRINTF("iREAD : sec_nr: %10llu sec: %10llu (%1lu,%1lu) pos: %15lu\n", - req->sector_number, sector, - blkif_first_sect(req->frame_and_sects[i]), - blkif_last_sect (req->frame_and_sects[i]), - (long)(sector << SECTOR_SHIFT)); - - dpage = (char *)MMAP_VADDR(ID_TO_IDX(req->id), i); - dpage += blkif_first_sect(req->frame_and_sects[i]) << SECTOR_SHIFT; - ret = fread(dpage, size << SECTOR_SHIFT, 1, img); - if (ret != 1) { - printf("fread error on READ\n"); - goto err; - } - } - - rsp = (blkif_response_t *)req; - rsp->id = req->id; - rsp->operation = BLKIF_OP_READ; - rsp->status = BLKIF_RSP_OKAY; - return BLKTAP_RESPOND; - } - } - - printf("Unknow block operation!\n"); -err: - rsp = (blkif_response_t *)req; - rsp->id = req->id; - rsp->operation = req->operation; - rsp->status = BLKIF_RSP_ERROR; - return BLKTAP_RESPOND; -} - -/* the image library terminates the request stream. _resp is a noop. */ -int image_response(blkif_response_t *rsp) -{ - return BLKTAP_PASS; -} - -void image_init(void) -{ - int i; - - for (i = 0; i < MAX_DOMS; i++) - images[i] = NULL; -} - diff --git a/tools/blktap/blkimglib.h b/tools/blktap/blkimglib.h deleted file mode 100644 index 1bc597f233..0000000000 --- a/tools/blktap/blkimglib.h +++ /dev/null @@ -1,16 +0,0 @@ -/* blkimglib.h - * - * file image-backed block device. - * - * (c) 2004 Andrew Warfield. - * - * Xend has been modified to use an amorfs:[fsid] disk tag. - * This will show up as device type (maj:240,min:0) = 61440. - * - * The fsid is placed in the sec_start field of the disk extent. - */ - -int image_control(control_msg_t *msg); -int image_request(blkif_request_t *req); -int image_response(blkif_response_t *rsp); /* noop */ -void image_init(void); diff --git a/tools/blktap/block-async.c b/tools/blktap/block-async.c new file mode 100755 index 0000000000..6f26071ade --- /dev/null +++ b/tools/blktap/block-async.c @@ -0,0 +1,404 @@ +/* block-async.c + * + * Asynchronous block wrappers for parallax. + */ + + +#include +#include +#include +#include +#include "block-async.h" +#include "blockstore.h" +#include "vdi.h" + + +#if 0 +#define DPRINTF(_f, _a...) printf ( _f , ## _a ) +#else +#define DPRINTF(_f, _a...) ((void)0) +#endif + +/* We have a queue of outstanding I/O requests implemented as a + * circular producer-consumer ring with free-running buffers. + * to allow reordering, this ring indirects to indexes in an + * ring of io_structs. + * + * the block_* calls may either add an entry to this ring and return, + * or satisfy the request immediately and call the callback directly. + * None of the io calls in parallax should be nested enough to worry + * about stack problems with this approach. + */ + +struct read_args { + u64 addr; +}; + +struct write_args { + u64 addr; + char *block; +}; + +struct alloc_args { + char *block; +}; + +struct pending_io_req { + enum {IO_READ, IO_WRITE, IO_ALLOC, IO_RWAKE, IO_WWAKE} op; + union { + struct read_args r; + struct write_args w; + struct alloc_args a; + } u; + io_cb_t cb; + void *param; +}; + +void radix_lock_init(struct radix_lock *r) +{ + int i; + + pthread_mutex_init(&r->lock, NULL); + for (i=0; i < 1024; i++) { + r->lines[i] = 0; + r->waiters[i] = NULL; + r->state[i] = ANY; + } +} + +/* maximum outstanding I/O requests issued asynchronously */ +/* must be a power of 2.*/ +#define MAX_PENDING_IO 1024 //1024 + +/* how many threads to concurrently issue I/O to the disk. */ +#define IO_POOL_SIZE 10 //10 + +static struct pending_io_req pending_io_reqs[MAX_PENDING_IO]; +static int pending_io_list[MAX_PENDING_IO]; +static unsigned long io_prod = 0, io_cons = 0, io_free = 0; +#define PENDING_IO_MASK(_x) ((_x) & (MAX_PENDING_IO - 1)) +#define PENDING_IO_IDX(_x) ((_x) - pending_io_reqs) +#define PENDING_IO_ENT(_x) \ + (&pending_io_reqs[pending_io_list[PENDING_IO_MASK(_x)]]) +#define CAN_PRODUCE_PENDING_IO ((io_free + MAX_PENDING_IO) != io_prod) +#define CAN_CONSUME_PENDING_IO (io_cons != io_prod) +static pthread_mutex_t pending_io_lock = PTHREAD_MUTEX_INITIALIZER; +static pthread_cond_t pending_io_cond = PTHREAD_COND_INITIALIZER; + +static void init_pending_io(void) +{ + int i; + + for (i=0; iop = IO_READ; + req->u.r.addr = addr; + req->cb = cb; + req->param = param; + + pthread_cond_signal(&pending_io_cond); + pthread_mutex_unlock(&pending_io_lock); +} + + +void block_write(u64 addr, char *block, io_cb_t cb, void *param) +{ + struct pending_io_req *req; + + pthread_mutex_lock(&pending_io_lock); + assert(CAN_PRODUCE_PENDING_IO); + + req = PENDING_IO_ENT(io_prod++); + DPRINTF("Produce (W) %lu (%p)\n", io_prod - 1, req); + req->op = IO_WRITE; + req->u.w.addr = addr; + req->u.w.block = block; + req->cb = cb; + req->param = param; + + pthread_cond_signal(&pending_io_cond); + pthread_mutex_unlock(&pending_io_lock); +} + + +void block_alloc(char *block, io_cb_t cb, void *param) +{ + struct pending_io_req *req; + + pthread_mutex_lock(&pending_io_lock); + assert(CAN_PRODUCE_PENDING_IO); + + req = PENDING_IO_ENT(io_prod++); + req->op = IO_ALLOC; + req->u.a.block = block; + req->cb = cb; + req->param = param; + + pthread_cond_signal(&pending_io_cond); + pthread_mutex_unlock(&pending_io_lock); +} + +void block_rlock(struct radix_lock *r, int row, io_cb_t cb, void *param) +{ + struct io_ret ret; + pthread_mutex_lock(&r->lock); + + if (( r->lines[row] >= 0 ) && (r->state[row] != STOP)) { + r->lines[row]++; + r->state[row] = READ; + DPRINTF("RLOCK : %3d (row: %d)\n", r->lines[row], row); + pthread_mutex_unlock(&r->lock); + ret.type = IO_INT_T; + ret.u.i = 0; + cb(ret, param); + } else { + struct radix_wait **rwc; + struct radix_wait *rw = + (struct radix_wait *) malloc (sizeof(struct radix_wait)); + DPRINTF("RLOCK : %3d (row: %d) -- DEFERRED!\n", r->lines[row], row); + rw->type = RLOCK; + rw->param = param; + rw->cb = cb; + rw->next = NULL; + /* append to waiters list. */ + rwc = &r->waiters[row]; + while (*rwc != NULL) rwc = &(*rwc)->next; + *rwc = rw; + pthread_mutex_unlock(&r->lock); + return; + } +} + + +void block_wlock(struct radix_lock *r, int row, io_cb_t cb, void *param) +{ + struct io_ret ret; + pthread_mutex_lock(&r->lock); + + /* the second check here is redundant -- just here for debugging now. */ + if ((r->state[row] == ANY) && ( r->lines[row] == 0 )) { + r->state[row] = STOP; + r->lines[row] = -1; + DPRINTF("WLOCK : %3d (row: %d)\n", r->lines[row], row); + pthread_mutex_unlock(&r->lock); + ret.type = IO_INT_T; + ret.u.i = 0; + cb(ret, param); + } else { + struct radix_wait **rwc; + struct radix_wait *rw = + (struct radix_wait *) malloc (sizeof(struct radix_wait)); + DPRINTF("WLOCK : %3d (row: %d) -- DEFERRED!\n", r->lines[row], row); + rw->type = WLOCK; + rw->param = param; + rw->cb = cb; + rw->next = NULL; + /* append to waiters list. */ + rwc = &r->waiters[row]; + while (*rwc != NULL) rwc = &(*rwc)->next; + *rwc = rw; + pthread_mutex_unlock(&r->lock); + return; + } + +} + +/* called with radix_lock locked and lock count of zero. */ +static void wake_waiters(struct radix_lock *r, int row) +{ + struct pending_io_req *req; + struct radix_wait *rw; + + DPRINTF("prewake\n"); + if (r->lines[row] != 0) return; + if (r->waiters[row] == NULL) {DPRINTF("nowaiters!\n");return;} + + DPRINTF("wake\n"); + if (r->waiters[row]->type == WLOCK) { + rw = r->waiters[row]; + pthread_mutex_lock(&pending_io_lock); + assert(CAN_PRODUCE_PENDING_IO); + + req = PENDING_IO_ENT(io_prod++); + DPRINTF("Produce (WWAKE) %lu (%p)\n", io_prod - 1, req); + req->op = IO_WWAKE; + req->cb = rw->cb; + req->param = rw->param; + r->lines[row] = -1; /* write lock the row. */ + r->state[row] = STOP; + r->waiters[row] = rw->next; + free(rw); + pthread_mutex_unlock(&pending_io_lock); + } else /* RLOCK */ { + while ((r->waiters[row] != NULL) && (r->waiters[row]->type == RLOCK)) { + rw = r->waiters[row]; + pthread_mutex_lock(&pending_io_lock); + assert(CAN_PRODUCE_PENDING_IO); + + req = PENDING_IO_ENT(io_prod++); + DPRINTF("Produce (RWAKE) %lu (%p)\n", io_prod - 1, req); + req->op = IO_RWAKE; + req->cb = rw->cb; + req->param = rw->param; + r->lines[row]++; /* read lock the row. */ + r->state[row] = READ; + r->waiters[row] = rw->next; + free(rw); + pthread_mutex_unlock(&pending_io_lock); + } + if (r->waiters[row] != NULL) /* There is a write queued still */ + r->state[row] = STOP; + } + + DPRINTF("wakedone\n"); + DPRINTF("prod: %lu cons: %lu free: %lu\n", io_prod, io_cons, io_free); + pthread_mutex_lock(&pending_io_lock); + pthread_cond_signal(&pending_io_cond); + pthread_mutex_unlock(&pending_io_lock); +} + +void block_runlock(struct radix_lock *r, int row, io_cb_t cb, void *param) +{ + struct io_ret ret; + + pthread_mutex_lock(&r->lock); + assert(r->lines[row] > 0); /* try to catch misuse. */ + r->lines[row]--; + DPRINTF("RUNLOCK: %3d (row: %d)\n", r->lines[row], row); + if (r->lines[row] == 0) { + r->state[row] = ANY; + wake_waiters(r, row); + } + pthread_mutex_unlock(&r->lock); + cb(ret, param); +} + +void block_wunlock(struct radix_lock *r, int row, io_cb_t cb, void *param) +{ + struct io_ret ret; + + pthread_mutex_lock(&r->lock); + assert(r->lines[row] == -1); /* try to catch misuse. */ + r->lines[row] = 0; + r->state[row] = ANY; + DPRINTF("WUNLOCK: %3d (row: %d)\n", r->lines[row], row); + wake_waiters(r, row); + pthread_mutex_unlock(&r->lock); + cb(ret, param); +} + +/* consumer calls */ +static void do_next_io_req(struct pending_io_req *req) +{ + struct io_ret ret; + void *param; + + switch (req->op) { + case IO_READ: + ret.type = IO_BLOCK_T; + ret.u.b = readblock(req->u.r.addr); + break; + case IO_WRITE: + ret.type = IO_INT_T; + ret.u.i = writeblock(req->u.w.addr, req->u.w.block); + DPRINTF("wrote %d at %Lu\n", *(int *)(req->u.w.block), req->u.w.addr); + break; + case IO_ALLOC: + ret.type = IO_ADDR_T; + ret.u.a = allocblock(req->u.a.block); + break; + case IO_RWAKE: + DPRINTF("WAKE DEFERRED RLOCK!\n"); + ret.type = IO_INT_T; + ret.u.i = 0; + break; + case IO_WWAKE: + DPRINTF("WAKE DEFERRED WLOCK!\n"); + ret.type = IO_INT_T; + ret.u.i = 0; + break; + default: + DPRINTF("Unknown IO operation on pending list!\n"); + return; + } + + param = req->param; + DPRINTF("freeing idx %d to slot %lu.\n", PENDING_IO_IDX(req), PENDING_IO_MASK(io_free)); + pthread_mutex_lock(&pending_io_lock); + pending_io_list[PENDING_IO_MASK(io_free++)] = PENDING_IO_IDX(req); + DPRINTF(" : prod: %lu cons: %lu free: %lu\n", io_prod, io_cons, io_free); + pthread_mutex_unlock(&pending_io_lock); + + assert(req->cb != NULL); + req->cb(ret, param); + +} + +void *io_thread(void *param) +{ + int tid; + struct pending_io_req *req; + + /* Set this thread's tid. */ + tid = *(int *)param; + free(param); + + DPRINTF("IOT %2d started.\n", tid); + +start: + pthread_mutex_lock(&pending_io_lock); + while (io_prod == io_cons) { + pthread_cond_wait(&pending_io_cond, &pending_io_lock); + } + + if (io_prod == io_cons) { + /* unnecessary wakeup. */ + pthread_mutex_unlock(&pending_io_lock); + goto start; + } + + req = PENDING_IO_ENT(io_cons++); + DPRINTF("IOT %2d has req %04d(%p).\n", tid, PENDING_IO_IDX(req), req); + DPRINTF(" : prod: %lu cons: %lu free: %lu\n", io_prod, io_cons, io_free); + pthread_mutex_unlock(&pending_io_lock); + + + do_next_io_req(req); + + goto start; + +} + +static pthread_t io_pool[IO_POOL_SIZE]; +void start_io_threads(void) + +{ + int i, tid=0; + + for (i=0; i < IO_POOL_SIZE; i++) { + int ret, *t; + t = (int *)malloc(sizeof(int)); + *t = tid++; + ret = pthread_create(&io_pool[i], NULL, io_thread, t); + if (ret != 0) printf("Error starting thread %d\n", i); + } + +} + +void init_block_async(void) +{ + init_pending_io(); + start_io_threads(); +} diff --git a/tools/blktap/block-async.h b/tools/blktap/block-async.h new file mode 100755 index 0000000000..b19d464a52 --- /dev/null +++ b/tools/blktap/block-async.h @@ -0,0 +1,69 @@ +/* block-async.h + * + * Asynchronous block wrappers for parallax. + */ + +#ifndef _BLOCKASYNC_H_ +#define _BLOCKASYNC_H_ + +#include +#include +#include "vdi.h" + +struct io_ret +{ + enum {IO_ADDR_T, IO_BLOCK_T, IO_INT_T} type; + union { + u64 a; + char *b; + int i; + } u; +}; + +typedef void (*io_cb_t)(struct io_ret r, void *param); + +/* per-vdi lock structures to make sure requests run in a safe order. */ +struct radix_wait { + enum {RLOCK, WLOCK} type; + io_cb_t cb; + void *param; + struct radix_wait *next; +}; + +struct radix_lock { + pthread_mutex_t lock; + int lines[1024]; + struct radix_wait *waiters[1024]; + enum {ANY, READ, STOP} state[1024]; +}; +void radix_lock_init(struct radix_lock *r); + +void block_read(u64 addr, io_cb_t cb, void *param); +void block_write(u64 addr, char *block, io_cb_t cb, void *param); +void block_alloc(char *block, io_cb_t cb, void *param); +void block_rlock(struct radix_lock *r, int row, io_cb_t cb, void *param); +void block_wlock(struct radix_lock *r, int row, io_cb_t cb, void *param); +void block_runlock(struct radix_lock *r, int row, io_cb_t cb, void *param); +void block_wunlock(struct radix_lock *r, int row, io_cb_t cb, void *param); +void init_block_async(void); + +static inline u64 IO_ADDR(struct io_ret r) +{ + assert(r.type == IO_ADDR_T); + return r.u.a; +} + +static inline char *IO_BLOCK(struct io_ret r) +{ + assert(r.type == IO_BLOCK_T); + return r.u.b; +} + +static inline int IO_INT(struct io_ret r) +{ + assert(r.type == IO_INT_T); + return r.u.i; +} + + +#endif //_BLOCKASYNC_H_ diff --git a/tools/blktap/blockstore-tls.c b/tools/blktap/blockstore-tls.c deleted file mode 100644 index 67808d7c28..0000000000 --- a/tools/blktap/blockstore-tls.c +++ /dev/null @@ -1,161 +0,0 @@ -/************************************************************************** - * - * blockstore.c - * - * Simple block store interface - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include "blockstore.h" -#include "parallax-threaded.h" - -/*static int block_fp = -1;*/ - -static int fd_list[READ_POOL_SIZE+1]; - -/** - * readblock: read a block from disk - * @id: block id to read - * - * @return: pointer to block, NULL on error - */ - -void *readblock(u64 id) -{ - void *block; - int tid = (int)pthread_getspecific(tid_key); - - if (lseek64(fd_list[tid], ((off64_t) id - 1LL) * BLOCK_SIZE, SEEK_SET) < 0) { - printf ("%Ld\n", (id - 1) * BLOCK_SIZE); - perror("readblock lseek"); - goto err; - } - if ((block = malloc(BLOCK_SIZE)) == NULL) { - perror("readblock malloc"); - goto err; - } - if (read(fd_list[tid], block, BLOCK_SIZE) != BLOCK_SIZE) { - perror("readblock read"); - free(block); - goto err; - } - return block; - -err: - return NULL; -} - -/** - * writeblock: write an existing block to disk - * @id: block id - * @block: pointer to block - * - * @return: zero on success, -1 on failure - */ -int writeblock(u64 id, void *block) -{ - int tid = (int)pthread_getspecific(tid_key); - - if (lseek64(fd_list[tid], ((off64_t) id - 1LL) * BLOCK_SIZE, SEEK_SET) < 0) { - perror("writeblock lseek"); - goto err; - } - if (write(fd_list[tid], block, BLOCK_SIZE) < 0) { - perror("writeblock write"); - goto err; - } - return 0; - -err: - return -1; -} - -/** - * allocblock: write a new block to disk - * @block: pointer to block - * - * @return: new id of block on disk - */ - -u64 allocblock(void *block) -{ - u64 lb; - off64_t pos; - int tid = (int)pthread_getspecific(tid_key); - - pos = lseek64(fd_list[tid], 0, SEEK_END); - if (pos == (off64_t)-1) { - perror("allocblock lseek"); - goto err; - } - if (pos % BLOCK_SIZE != 0) { - fprintf(stderr, "file size not multiple of %d\n", BLOCK_SIZE); - goto err; - } - if (write(fd_list[tid], block, BLOCK_SIZE) != BLOCK_SIZE) { - perror("allocblock write"); - goto err; - } - lb = pos / BLOCK_SIZE + 1; - - return lb; - -err: - return 0; - -} - - -/** - * newblock: get a new in-memory block set to zeros - * - * @return: pointer to new block, NULL on error - */ -void *newblock() -{ - void *block = malloc(BLOCK_SIZE); - if (block == NULL) { - perror("newblock"); - return NULL; - } - memset(block, 0, BLOCK_SIZE); - return block; -} - - -/** - * freeblock: unallocate an in-memory block - * @id: block id (zero if this is only in-memory) - * @block: block to be freed - */ -void freeblock(void *block) -{ - if (block != NULL) - free(block); -} - - -int __init_blockstore(void) -{ - int i; - - for (i=0; i<(READ_POOL_SIZE+1); i++) { - - fd_list[i] = open("blockstore.dat", - O_RDWR | O_CREAT | O_LARGEFILE, 0644); - - if (fd_list[i] < 0) { - perror("open"); - return -1; - } - } - return 0; -} diff --git a/tools/blktap/blockstore.c b/tools/blktap/blockstore.c index 36903fe09e..a9dde6e461 100644 --- a/tools/blktap/blockstore.c +++ b/tools/blktap/blockstore.c @@ -19,7 +19,7 @@ #include #include "parallax-threaded.h" -#define BLOCKSTORE_REMOTE +//#define BLOCKSTORE_REMOTE //#define BSDEBUG #define RETRY_TIMEOUT 1000000 /* microseconds */ @@ -942,7 +942,8 @@ u64 allocblock_hint(void *block, u64 hint) { void *readblock(u64 id) { void *block; int block_fp; - + +//printf("readblock(%llu)\n", id); block_fp = open("blockstore.dat", O_RDONLY | O_CREAT | O_LARGEFILE, 0644); if (block_fp < 0) { @@ -1336,6 +1337,7 @@ int __init_blockstore(void) void __exit_blockstore(void) { int i; +#ifdef BLOCKSTORE_REMOTE pthread_mutex_destroy(&ptmutex_recv); pthread_mutex_destroy(&ptmutex_luid); pthread_mutex_destroy(&ptmutex_queue); @@ -1345,4 +1347,5 @@ void __exit_blockstore(void) pthread_mutex_destroy(&(pool_thread[i].ptmutex)); pthread_cond_destroy(&(pool_thread[i].ptcv)); } +#endif } diff --git a/tools/blktap/libgnbd/Makefile b/tools/blktap/libgnbd/Makefile deleted file mode 100644 index 4297c02148..0000000000 --- a/tools/blktap/libgnbd/Makefile +++ /dev/null @@ -1,8 +0,0 @@ - -CFLAGS += -Wall -Werror -g -LDFLAGS += -g - -libgnbd.a: libgnbd.o - $(AR) r $@ $< - -gnbdtest: gnbdtest.o libgnbd.a diff --git a/tools/blktap/libgnbd/gnbdtest.c b/tools/blktap/libgnbd/gnbdtest.c deleted file mode 100644 index bc391591b0..0000000000 --- a/tools/blktap/libgnbd/gnbdtest.c +++ /dev/null @@ -1,90 +0,0 @@ - -#include -#include -#include -#include -#include -#include - -#include - -#include "libgnbd.h" - -#define PRINTF(x) printf x -#if 0 -#define DFPRINTF(x...) fprintf(stderr, ##x) -#define DPRINTF(x) DFPRINTF x -#else -#define DPRINTF(x) -#endif - -static unsigned char buf1[8 << 9]; -static unsigned char buf2[8 << 9]; -static unsigned char buf3[8 << 9]; - -int -main(int argc, char **argv) -{ - struct gnbd_handle *gh; - struct pollfd pfd[1]; - int err, tout; - - gh = gnbd_setup("panik", 0x38e7, "cl349-nahant-beta2-root1", - "arcadians.cl.cam.ac.uk"); - if (gh == NULL) - errx(1, "gnbd_setup"); - - memset(pfd, 0, sizeof(pfd)); - pfd[0].fd = gnbd_fd(gh); - pfd[0].events = POLLIN; - - while ((tout = poll(pfd, 1, 0)) >= 0) { - if (tout == 0) - continue; - DPRINTF(("event\n")); - if (pfd[0].revents) { - err = gnbd_reply(gh); - pfd[0].events = POLLIN; - switch (err) { - case GNBD_LOGIN_DONE: - DPRINTF(("sectors: %08llu\n", - gnbd_sectors(gh))); - err = gnbd_read(gh, 8, 8, buf2, 1); - if (err) - warnx("gnbd_read"); - err = gnbd_read(gh, 0, 8, buf1, 0); - if (err) - warnx("gnbd_read"); - err = gnbd_read(gh, 16, 8, buf3, 2); - if (err) - warnx("gnbd_read"); - break; - case GNBD_REQUEST_DONE: - DPRINTF(("request done %ld\n", - gnbd_finished_request(gh))); - if (0 && gnbd_finished_request(gh) == 0) { - write(1, buf1, 8 << 9); - err = gnbd_write(gh, 0, 8, buf1, 10); - if (err) - warnx("gnbd_write"); - } - break; - case GNBD_CONTINUE: - DPRINTF(("continue\n")); - break; - case 0: - break; - case GNBD_CONTINUE_WRITE: - DPRINTF(("continue write\n")); - pfd[0].events |= POLLOUT; - break; - default: - warnx("gnbd_reply error"); - break; - } - DPRINTF(("got gnbd reply\n")); - } - } - - return 0; -} diff --git a/tools/blktap/libgnbd/libgnbd.c b/tools/blktap/libgnbd/libgnbd.c deleted file mode 100644 index 2856ca311d..0000000000 --- a/tools/blktap/libgnbd/libgnbd.c +++ /dev/null @@ -1,647 +0,0 @@ -/* libgnbd.c - * - * gnbd client library - * - * Copyright (c) 2005, Christian Limpach - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include - -#include "libgnbd.h" - -#define PROTOCOL_VERSION 2 - -#define EXTERN_KILL_GSERV_REQ 5 -#define EXTERN_LOGIN_REQ 6 - -#define GNBD_REQUEST_MAGIC 0x37a07e00 -#define GNBD_KEEP_ALIVE_MAGIC 0x5b46d8c2 -#define GNBD_REPLY_MAGIC 0x41f09370 - -enum { - GNBD_CMD_READ = 0, - GNBD_CMD_WRITE = 1, - GNBD_CMD_DISC = 2, - GNBD_CMD_PING = 3 -}; - -#if __BYTE_ORDER == __BIG_ENDIAN -#define htonll(x) (x) -#define ntohll(x) (x) -#endif -#if __BYTE_ORDER == __LITTLE_ENDIAN -#define htonll(x) bswap_64(x) -#define ntohll(x) bswap_64(x) -#endif - -#define PRINTF(x) printf x -#if 0 -#define DFPRINTF(x...) fprintf(stderr, ##x) -#define DPRINTF(x) DFPRINTF x -#else -#define DPRINTF(x) -#endif - -struct gnbd_request { - struct gnbd_request *gr_next; - unsigned char *gr_buf; - ssize_t gr_size; - ssize_t gr_done; - unsigned long gr_cookie; -}; - -struct gnbd_handle { - int gh_fd; - unsigned int gh_flags; - uint64_t gh_sectors; - char gh_devname[32]; - char gh_nodename[65]; - struct sockaddr_in gh_sin; - struct gnbd_request *gh_outstanding_requests; - struct gnbd_request **gh_outstanding_requests_last; - struct gnbd_request *gh_incoming_request; - unsigned long gh_finished_request; -}; -#define GHF_EXPECT_KILL_GSERV_REPLY 0x0001 -#define GHF_EXPECT_LOGIN_REPLY 0x0002 -#define GHF_INCOMING_REQUEST 0x0004 - -struct device_req { - char name[32]; -}; - -struct node_req { - char node_name[65]; -}; - -struct login_req { - uint64_t timestamp; - uint16_t version; - uint8_t pad[6]; - char devname[32]; -}; - -struct login_reply { - uint64_t sectors; - uint16_t version; - uint8_t err; - uint8_t pad[5]; -}; - -struct gnbd_server_request { - uint32_t magic; - uint32_t type; - char handle[8]; - uint64_t from; - uint32_t len; -} __attribute__ ((packed)); - -struct gnbd_server_reply { - uint32_t magic; - uint32_t error; - char handle[8]; -} __attribute__ ((packed)); - -static int -read_buf(int fd, void *buf, size_t count, size_t *read_count) -{ - int err; - - err = read(fd, buf, count); - if (read_count) { - if (err >= 0) - *read_count = err; - } else if (err != count) - return EINTR; /* xxx */ - return err < 0; -} - -static int -read_4(int fd, unsigned long *val) -{ - unsigned long buf; - int err; - - err = read_buf(fd, &buf, sizeof(buf), NULL); - if (err == 0) - *val = ntohl(buf); - return err; -} - -static int -write_buf(int fd, void *buf, size_t count) -{ - int err; - - err = write(fd, buf, count); - return err < 0; -} - -static int -write_4(int fd, unsigned long val) -{ - unsigned long buf; - int err; - - buf = htonl(val); - err = write_buf(fd, &buf, sizeof(buf)); - return err; -} - - -static int -socket_connect(struct gnbd_handle *gh) -{ - int err; - - if (gh->gh_fd >= 0) - return 0; - - gh->gh_fd = socket(PF_INET, SOCK_STREAM, 0); - if (gh->gh_fd < 0) { - warn("socket"); - return gh->gh_fd; - } - - err = connect(gh->gh_fd, (struct sockaddr *)&gh->gh_sin, - sizeof(gh->gh_sin)); - if (err) { - warn("connect"); - goto out; - } - - return 0; - out: - close (gh->gh_fd); - gh->gh_fd = -1; - return err; -} - -static int -socket_shutdown(struct gnbd_handle *gh) -{ - - close (gh->gh_fd); - gh->gh_fd = -1; - return 0; -} - -static int -find_request(struct gnbd_handle *gh, struct gnbd_request *gr) -{ - struct gnbd_request **tmp; - - for (tmp = &gh->gh_outstanding_requests; *tmp; - tmp = &(*tmp)->gr_next) { - if (*tmp == gr) { - *tmp = (*tmp)->gr_next; - if (*tmp == NULL) - gh->gh_outstanding_requests_last = tmp; - return 0; - } - } - return ENOENT; -} - -static int -kill_gserv(struct gnbd_handle *gh) -{ - struct device_req dr; - struct node_req nr; - int err; - - DPRINTF(("gnbd_kill_gserv\n")); - err = socket_connect(gh); - if (err) { - warnx("socket_connect"); - return err; - } - - err = write_4(gh->gh_fd, EXTERN_KILL_GSERV_REQ); - if (err) { - warnx("send EXTERN_LOGIN_REQ failed"); - goto out; - } - - strncpy(dr.name, gh->gh_devname, sizeof(dr.name)); - err = write_buf(gh->gh_fd, &dr, sizeof(dr)); - if (err) { - warnx("send device_req failed"); - goto out; - } - - strncpy(nr.node_name, gh->gh_nodename, sizeof(nr.node_name)); - err = write_buf(gh->gh_fd, &nr, sizeof(nr)); - if (err) { - warnx("send node_req failed"); - goto out; - } - - gh->gh_flags |= GHF_EXPECT_KILL_GSERV_REPLY; - DPRINTF(("gnbd_kill_gserv ok\n")); - - return 0; - out: - socket_shutdown(gh); - return err; -} - -static int -login(struct gnbd_handle *gh) -{ - struct login_req lr; - struct node_req nr; - int err; - uint64_t timestamp; - struct timeval tv; - - DPRINTF(("gnbd_login\n")); - err = socket_connect(gh); - if (err) { - warnx("socket_connect"); - return err; - } - - err = write_4(gh->gh_fd, EXTERN_LOGIN_REQ); - if (err) { - warnx("send EXTERN_LOGIN_REQ failed"); - goto out; - } - - err = gettimeofday(&tv, NULL); - if (err) { - warnx("gettimeofday"); - goto out; - } - timestamp = (uint64_t)tv.tv_sec * 1000000 + tv.tv_usec; - - lr.timestamp = htonll(timestamp); - lr.version = htons(PROTOCOL_VERSION); - strncpy(lr.devname, gh->gh_devname, sizeof(lr.devname)); - err = write_buf(gh->gh_fd, &lr, sizeof(lr)); - if (err) { - warnx("send login_req failed"); - goto out; - } - - strncpy(nr.node_name, gh->gh_nodename, sizeof(nr.node_name)); - err = write_buf(gh->gh_fd, &nr, sizeof(nr)); - if (err) { - warnx("send node_req failed"); - goto out; - } - - gh->gh_flags |= GHF_EXPECT_LOGIN_REPLY; - - DPRINTF(("gnbd_login ok\n")); - return 0; - out: - socket_shutdown(gh); - return err; -} - -static int -kill_gserv_reply(struct gnbd_handle *gh) -{ - unsigned long reply; - int err; - - DPRINTF(("read gnbd_kill_gserv_reply\n")); - err = read_4(gh->gh_fd, &reply); - if (err) { - warnx("read kill_gserv_reply failed"); - return err; - } - - if (reply && reply != ENODEV) { - warnx("kill gserv failed: %s", strerror(reply)); - return reply; - } - - gh->gh_flags &= ~GHF_EXPECT_KILL_GSERV_REPLY; - socket_shutdown(gh); - - err = login(gh); - if (err) - warnx("gnbd_login"); - - return err; -} - -static int -login_reply(struct gnbd_handle *gh) -{ - struct login_reply lr; - int err; - - DPRINTF(("read gnbd_login_reply\n")); - err = read_buf(gh->gh_fd, &lr, sizeof(lr), NULL); - if (err) { - warnx("read login_reply failed"); - return err; - } - - if (lr.err) { - if (lr.version) { - warnx("gnbd version mismatch %04x != %04x", - PROTOCOL_VERSION, ntohs(lr.version)); - return EINVAL; - } - warnx("login refused: %s", strerror(lr.err)); - return lr.err; - } - gh->gh_sectors = ntohll(lr.sectors); - - gh->gh_flags &= ~GHF_EXPECT_LOGIN_REPLY; - - return GNBD_LOGIN_DONE; -} - -static int -incoming_request(struct gnbd_handle *gh) -{ - struct gnbd_request *gr = gh->gh_incoming_request; - ssize_t done; - int err; - - DPRINTF(("incoming_request: done %d size %d\n", gr->gr_done, - gr->gr_size)); - err = read_buf(gh->gh_fd, gr->gr_buf + gr->gr_done, - gr->gr_size - gr->gr_done, &done); - if (err) - goto out; - - DPRINTF(("incoming_request: got %d\n", done)); - gr->gr_done += done; - if (gr->gr_done == gr->gr_size) { - gh->gh_flags &= ~GHF_INCOMING_REQUEST; - gh->gh_finished_request = gr->gr_cookie; - free(gr); - return GNBD_REQUEST_DONE; - } - - return GNBD_CONTINUE; - - out: - gh->gh_flags &= ~GHF_INCOMING_REQUEST; - gh->gh_finished_request = 0; - free(gr); - return err; -} - - - -int -gnbd_close(struct gnbd_handle *gh) -{ - int err; - struct gnbd_request **tmp; - - for (tmp = &gh->gh_outstanding_requests; *tmp; tmp = &(*tmp)->gr_next) - free(*tmp); - - if (gh->gh_flags & GHF_INCOMING_REQUEST) - free(gh->gh_incoming_request); - - err = close(gh->gh_fd); - if (err) - warnx("close"); - free(gh); - - return err; -} - -int -gnbd_fd(struct gnbd_handle *gh) -{ - return gh->gh_fd; -} - -unsigned long -gnbd_finished_request(struct gnbd_handle *gh) -{ - return gh->gh_finished_request; -} - -int -gnbd_read(struct gnbd_handle *gh, uint64_t sector, ssize_t count, - unsigned char *buf, unsigned long cookie) -{ - struct gnbd_server_request gsr; - struct gnbd_request *gr; - int err; - - gr = malloc(sizeof(struct gnbd_request)); - if (gr == NULL) - return ENOMEM; - memset(gr, 0, sizeof(gr)); - - gr->gr_buf = buf; - gr->gr_size = count << 9; - gr->gr_done = 0; - gr->gr_cookie = cookie; - - gsr.magic = htonl(GNBD_REQUEST_MAGIC); - gsr.type = htonl(GNBD_CMD_READ); - gsr.from = htonll(sector << 9); - gsr.len = htonl(gr->gr_size); - memset(gsr.handle, 0, sizeof(gsr.handle)); - memcpy(gsr.handle, &gr, sizeof(gr)); - - err = write_buf(gh->gh_fd, &gsr, sizeof(gsr)); - if (err) { - warnx("write_buf"); - goto out; - } - - *gh->gh_outstanding_requests_last = gr; - gh->gh_outstanding_requests_last = &gr->gr_next; - - return 0; - - out: - free(gr); - return err; -} - -int -gnbd_write(struct gnbd_handle *gh, uint64_t sector, ssize_t count, - unsigned char *buf, unsigned long cookie) -{ - struct gnbd_server_request gsr; - struct gnbd_request *gr; - int err; - - gr = malloc(sizeof(struct gnbd_request)); - if (gr == NULL) - return ENOMEM; - memset(gr, 0, sizeof(gr)); - - gr->gr_buf = buf; - gr->gr_size = count << 9; - gr->gr_done = 0; - gr->gr_cookie = cookie; - - gsr.magic = htonl(GNBD_REQUEST_MAGIC); - gsr.type = htonl(GNBD_CMD_WRITE); - gsr.from = htonll(sector << 9); - gsr.len = htonl(gr->gr_size); - memset(gsr.handle, 0, sizeof(gsr.handle)); - memcpy(gsr.handle, &gr, sizeof(gr)); - - err = write_buf(gh->gh_fd, &gsr, sizeof(gsr)); - if (err) { - warnx("write_buf"); - goto out; - } - - /* XXX handle non-blocking socket */ - err = write_buf(gh->gh_fd, buf, gr->gr_size); - if (err) { - warnx("write_buf"); - goto out; - } - gr->gr_done += gr->gr_size; - - *gh->gh_outstanding_requests_last = gr; - gh->gh_outstanding_requests_last = &gr->gr_next; - - DPRINTF(("write done\n")); - - return 0; - - out: - free(gr); - return err; -} - -int -gnbd_reply(struct gnbd_handle *gh) -{ - struct gnbd_server_reply gsr; - struct gnbd_request *gr; - int err; - - DPRINTF(("gnbd_reply flags %x\n", gh->gh_flags)); - if ((gh->gh_flags & GHF_EXPECT_KILL_GSERV_REPLY)) - return kill_gserv_reply(gh); - if ((gh->gh_flags & GHF_EXPECT_LOGIN_REPLY)) - return login_reply(gh); - if ((gh->gh_flags & GHF_INCOMING_REQUEST)) - return incoming_request(gh); - - DPRINTF(("read response\n")); - err = read_buf(gh->gh_fd, &gsr, sizeof(gsr), NULL); - if (err) { - warnx("read gnbd_reply failed"); - return err; - } - - if (ntohl(gsr.error)) { - warnx("gnbd server reply error: %s", strerror(gsr.error)); - return gsr.error; - } - - switch (ntohl(gsr.magic)) { - case GNBD_KEEP_ALIVE_MAGIC: - DPRINTF(("read keep alive magic\n")); - return GNBD_CONTINUE; - case GNBD_REPLY_MAGIC: - DPRINTF(("read reply magic\n")); - memcpy(&gr, gsr.handle, sizeof(gr)); - err = find_request(gh, gr); - if (err) { - warnx("unknown request"); - return err; - } - if (gr->gr_done != gr->gr_size) { - gh->gh_incoming_request = gr; - gh->gh_flags |= GHF_INCOMING_REQUEST; - return GNBD_CONTINUE; - } else { - gh->gh_finished_request = gr->gr_cookie; - free(gr); - return GNBD_REQUEST_DONE; - } - default: - break; - } - - return GNBD_CONTINUE; -} - -uint64_t -gnbd_sectors(struct gnbd_handle *gh) -{ - - return gh->gh_sectors; -} - -struct gnbd_handle * -gnbd_setup(char *server, unsigned int port, char *devname, char *nodename) -{ - struct gnbd_handle *gh; - struct addrinfo *res, *ai; - int err; - - gh = malloc(sizeof(struct gnbd_handle)); - if (gh == NULL) - return NULL; - memset(gh, 0, sizeof(gh)); - gh->gh_fd = -1; - gh->gh_outstanding_requests_last = &gh->gh_outstanding_requests; - - strncpy(gh->gh_devname, devname, sizeof(gh->gh_devname)); - strncpy(gh->gh_nodename, nodename, sizeof(gh->gh_nodename)); - - err = getaddrinfo(server, NULL, NULL, &res); - if (err) { - if (err != EAI_SYSTEM) - warnx("getaddrinfo: %s", gai_strerror(err)); - else - warn("getaddrinfo: %s", gai_strerror(err)); - goto out; - } - - for (ai = res; ai; ai = ai->ai_next) { - if (ai->ai_socktype != SOCK_STREAM) - continue; - if (ai->ai_family == AF_INET) - break; - } - - if (ai == NULL) - goto out; - - gh->gh_sin.sin_family = ai->ai_family; - gh->gh_sin.sin_port = htons(port); - memcpy(&gh->gh_sin.sin_addr, - &((struct sockaddr_in *)ai->ai_addr)->sin_addr, - sizeof(gh->gh_sin.sin_addr)); - - err = kill_gserv(gh); - if (err) { - warnx("gnbd_kill_gserv"); - goto out; - } - - freeaddrinfo(res); - return gh; - out: - free(gh); - freeaddrinfo(res); - return NULL; -} diff --git a/tools/blktap/libgnbd/libgnbd.h b/tools/blktap/libgnbd/libgnbd.h deleted file mode 100644 index 9fb3dbbd5f..0000000000 --- a/tools/blktap/libgnbd/libgnbd.h +++ /dev/null @@ -1,25 +0,0 @@ -/* libgnbd.h - * - * gnbd client library - * - * Copyright (c) 2005, Christian Limpach - */ - -#define GNBD_LOGIN_DONE 0x10001 -#define GNBD_REQUEST_DONE 0x10002 -#define GNBD_CONTINUE 0x10003 -#define GNBD_CONTINUE_WRITE 0x10004 - -struct gnbd_handle; -int gnbd_close(struct gnbd_handle *); -int gnbd_fd(struct gnbd_handle *); -unsigned long gnbd_finished_request(struct gnbd_handle *); -int gnbd_kill_gserv(struct gnbd_handle *); -int gnbd_login(struct gnbd_handle *); -int gnbd_read(struct gnbd_handle *, uint64_t, ssize_t, unsigned char *, - unsigned long); -int gnbd_write(struct gnbd_handle *, uint64_t, ssize_t, unsigned char *, - unsigned long); -int gnbd_reply(struct gnbd_handle *); -uint64_t gnbd_sectors(struct gnbd_handle *); -struct gnbd_handle *gnbd_setup(char *, unsigned int, char *, char *); diff --git a/tools/blktap/parallax-threaded.c b/tools/blktap/parallax-threaded.c index 25b80dea16..47e469b111 100644 --- a/tools/blktap/parallax-threaded.c +++ b/tools/blktap/parallax-threaded.c @@ -145,33 +145,33 @@ void blkif_destroy(blkif_be_destroy_t *destroy) destroy->status = BLKIF_BE_STATUS_OKAY; } -void vbd_grow(blkif_be_vbd_grow_t *grow) +void vbd_create(blkif_be_vbd_create_t *create) { blkif_t *blkif; vdi_t *vdi, **vdip; - blkif_vdev_t vdevice = grow->vdevice; + blkif_vdev_t vdevice = create->vdevice; - DPRINTF("parallax (vbd_grow): grow=%p\n", grow); + DPRINTF("parallax (vbd_create): create=%p\n", create); - blkif = blkif_find_by_handle(grow->domid, grow->blkif_handle); + blkif = blkif_find_by_handle(create->domid, create->blkif_handle); if ( blkif == NULL ) { - DPRINTF("vbd_grow attempted for non-existent blkif (%u,%u)\n", - grow->domid, grow->blkif_handle); - grow->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND; + DPRINTF("vbd_create attempted for non-existent blkif (%u,%u)\n", + create->domid, create->blkif_handle); + create->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND; return; } /* VDI identifier is in grow->extent.sector_start */ - DPRINTF("vbd_grow: grow->extent.sector_start (id) is %llx\n", - grow->extent.sector_start); + DPRINTF("vbd_create: create->dev_handle (id) is %lx\n", + (unsigned long)create->dev_handle); - vdi = vdi_get(grow->extent.sector_start); + vdi = vdi_get(create->dev_handle); if (vdi == NULL) { - printf("parallax (vbd_grow): VDI %llx not found.\n", - grow->extent.sector_start); - grow->status = BLKIF_BE_STATUS_VBD_NOT_FOUND; + printf("parallax (vbd_create): VDI %lx not found.\n", + (unsigned long)create->dev_handle); + create->status = BLKIF_BE_STATUS_VBD_NOT_FOUND; return; } @@ -183,7 +183,7 @@ void vbd_grow(blkif_be_vbd_grow_t *grow) *vdip = vdi; DPRINTF("vbd_grow: happy return!\n"); - grow->status = BLKIF_BE_STATUS_OKAY; + create->status = BLKIF_BE_STATUS_OKAY; } int parallax_control(control_msg_t *msg) @@ -213,10 +213,10 @@ int parallax_control(control_msg_t *msg) blkif_destroy((blkif_be_destroy_t *)msg->msg); break; - case CMSG_BLKIF_BE_VBD_GROW: - if ( msg->length != sizeof(blkif_be_vbd_grow_t) ) + case CMSG_BLKIF_BE_VBD_CREATE: + if ( msg->length != sizeof(blkif_be_vbd_create_t) ) goto parse_error; - vbd_grow((blkif_be_vbd_grow_t *)msg->msg); + vbd_create((blkif_be_vbd_create_t *)msg->msg); break; } return 0; diff --git a/tools/blktap/parallax.c b/tools/blktap/parallax.c index 9c853bc035..ee47957ada 100644 --- a/tools/blktap/parallax.c +++ b/tools/blktap/parallax.c @@ -10,11 +10,16 @@ #include #include #include +#include #include "blktaplib.h" #include "blockstore.h" #include "vdi.h" +#include "block-async.h" +#include "requests-async.h" #define PARALLAX_DEV 61440 +#define SECTS_PER_NODE 8 + #if 0 #define DPRINTF(_f, _a...) printf ( _f , ## _a ) @@ -142,33 +147,33 @@ void blkif_destroy(blkif_be_destroy_t *destroy) destroy->status = BLKIF_BE_STATUS_OKAY; } -void vbd_grow(blkif_be_vbd_grow_t *grow) +void vbd_create(blkif_be_vbd_create_t *create) { blkif_t *blkif; vdi_t *vdi, **vdip; - blkif_vdev_t vdevice = grow->vdevice; + blkif_vdev_t vdevice = create->vdevice; - DPRINTF("parallax (vbd_grow): grow=%p\n", grow); + DPRINTF("parallax (vbd_create): create=%p\n", create); - blkif = blkif_find_by_handle(grow->domid, grow->blkif_handle); + blkif = blkif_find_by_handle(create->domid, create->blkif_handle); if ( blkif == NULL ) { - DPRINTF("vbd_grow attempted for non-existent blkif (%u,%u)\n", - grow->domid, grow->blkif_handle); - grow->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND; + DPRINTF("vbd_create attempted for non-existent blkif (%u,%u)\n", + create->domid, create->blkif_handle); + create->status = BLKIF_BE_STATUS_INTERFACE_NOT_FOUND; return; } /* VDI identifier is in grow->extent.sector_start */ - DPRINTF("vbd_grow: grow->extent.sector_start (id) is %llx\n", - grow->extent.sector_start); + DPRINTF("vbd_create: create->dev_handle (id) is %lx\n", + (unsigned long)create->dev_handle); - vdi = vdi_get(grow->extent.sector_start); + vdi = vdi_get(create->dev_handle); if (vdi == NULL) { - printf("parallax (vbd_grow): VDI %llx not found.\n", - grow->extent.sector_start); - grow->status = BLKIF_BE_STATUS_VBD_NOT_FOUND; + printf("parallax (vbd_create): VDI %lx not found.\n", + (unsigned long)create->dev_handle); + create->status = BLKIF_BE_STATUS_VBD_NOT_FOUND; return; } @@ -180,7 +185,7 @@ void vbd_grow(blkif_be_vbd_grow_t *grow) *vdip = vdi; DPRINTF("vbd_grow: happy return!\n"); - grow->status = BLKIF_BE_STATUS_OKAY; + create->status = BLKIF_BE_STATUS_OKAY; } int parallax_control(control_msg_t *msg) @@ -210,10 +215,10 @@ int parallax_control(control_msg_t *msg) blkif_destroy((blkif_be_destroy_t *)msg->msg); break; - case CMSG_BLKIF_BE_VBD_GROW: - if ( msg->length != sizeof(blkif_be_vbd_grow_t) ) + case CMSG_BLKIF_BE_VBD_CREATE: + if ( msg->length != sizeof(blkif_be_vbd_create_t) ) goto parse_error; - vbd_grow((blkif_be_vbd_grow_t *)msg->msg); + vbd_create((blkif_be_vbd_create_t *)msg->msg); break; } return 0; @@ -248,9 +253,9 @@ int parallax_probe(blkif_request_t *req, blkif_t *blkif) img_info = (vdisk_t *)MMAP_VADDR(ID_TO_IDX(req->id), 0); img_info[nr_vdis].device = vdi->vdevice; img_info[nr_vdis].info = VDISK_TYPE_DISK | VDISK_FLAG_VIRT; - /* The -2 here accounts for the LSB in the radix tree */ + /* The -1 here accounts for the LSB in the radix tree */ img_info[nr_vdis].capacity = - ((1LL << (VDI_HEIGHT-2)) >> SECTOR_SHIFT); + ((1LL << (VDI_HEIGHT-1)) * SECTS_PER_NODE); nr_vdis++; vdi = vdi->next; } @@ -274,78 +279,122 @@ err: return BLKTAP_RESPOND; } +typedef struct { + blkif_request_t *req; + int count; + int error; + pthread_mutex_t mutex; +} pending_t; + +#define MAX_REQUESTS 64 +pending_t pending_list[MAX_REQUESTS]; + +struct cb_param { + pending_t *pent; + int segment; + u64 sector; + u64 vblock; /* for debug printing -- can be removed. */ +}; + +static void read_cb(struct io_ret r, void *in_param) +{ + struct cb_param *param = (struct cb_param *)in_param; + pending_t *p = param->pent; + int segment = param->segment; + blkif_request_t *req = p->req; + unsigned long size, offset, start; + char *dpage, *spage; + + spage = IO_BLOCK(r); + if (spage == NULL) { p->error++; goto finish; } + dpage = (char *)MMAP_VADDR(ID_TO_IDX(req->id), segment); + + /* Calculate read size and offset within the read block. */ + + offset = (param->sector << SECTOR_SHIFT) % BLOCK_SIZE; + size = ( blkif_last_sect (req->frame_and_sects[segment]) - + blkif_first_sect(req->frame_and_sects[segment]) + 1 + ) << SECTOR_SHIFT; + start = blkif_first_sect(req->frame_and_sects[segment]) + << SECTOR_SHIFT; + + DPRINTF("ParallaxRead: sect: %lld (%ld,%ld), " + "vblock %llx, " + "size %lx\n", + param->sector, blkif_first_sect(p->req->frame_and_sects[segment]), + blkif_last_sect (p->req->frame_and_sects[segment]), + param->vblock, size); + + memcpy(dpage + start, spage + offset, size); + freeblock(spage); + + /* Done the read. Now update the pending record. */ + finish: + pthread_mutex_lock(&p->mutex); + p->count--; + + if (p->count == 0) { + blkif_response_t *rsp; + + rsp = (blkif_response_t *)req; + rsp->id = req->id; + rsp->operation = BLKIF_OP_READ; + if (p->error == 0) { + rsp->status = BLKIF_RSP_OKAY; + } else { + rsp->status = BLKIF_RSP_ERROR; + } + blktap_inject_response(rsp); + } + + pthread_mutex_unlock(&p->mutex); + + free(param); /* TODO: replace with cached alloc/dealloc */ +} + int parallax_read(blkif_request_t *req, blkif_t *blkif) { blkif_response_t *rsp; - unsigned long size, offset, start; - u64 sector; u64 vblock, gblock; vdi_t *vdi; + u64 sector; int i; char *dpage, *spage; + pending_t *pent; vdi = blkif_get_vdi(blkif, req->device); if ( vdi == NULL ) goto err; + + pent = &pending_list[ID_TO_IDX(req->id)]; + pent->count = req->nr_segments; + pent->req = req; + pthread_mutex_init(&pent->mutex, NULL); for (i = 0; i < req->nr_segments; i++) { - - dpage = (char *)MMAP_VADDR(ID_TO_IDX(req->id), i); - - /* Round the requested segment to a block address. */ - - sector = req->sector_number + (8*i); - vblock = (sector << SECTOR_SHIFT) >> BLOCK_SHIFT; - - /* Get that block from the store. */ - - gblock = vdi_lookup_block(vdi, vblock, NULL); - - /* Calculate read size and offset within the read block. */ - - offset = (sector << SECTOR_SHIFT) % BLOCK_SIZE; - size = ( blkif_last_sect (req->frame_and_sects[i]) - - blkif_first_sect(req->frame_and_sects[i]) + 1 - ) << SECTOR_SHIFT; - start = blkif_first_sect(req->frame_and_sects[i]) << SECTOR_SHIFT; - - /* If the block does not exist in the store, return zeros. */ - /* Otherwise, copy that region to the guest page. */ - - DPRINTF("ParallaxRead: sect: %lld (%ld,%ld), " - "vblock %llx, gblock %llx, " - "size %lx\n", - sector, blkif_first_sect(req->frame_and_sects[i]), - blkif_last_sect (req->frame_and_sects[i]), - vblock, gblock, size); - - if ( gblock == 0 ) { - - memset(dpage + start, '\0', size); - - } else { - - spage = readblock(gblock); - - if (spage == NULL) { - printf("Error reading gblock from store: %Ld\n", gblock); - goto err; - } - - memcpy(dpage + start, spage + offset, size); - - freeblock(spage); - } - - } + pthread_t tid; + int ret; + struct cb_param *p; + + /* Round the requested segment to a block address. */ + sector = req->sector_number + (8*i); + vblock = (sector << SECTOR_SHIFT) >> BLOCK_SHIFT; + + /* TODO: Replace this call to malloc with a cached allocation */ + p = (struct cb_param *)malloc(sizeof(struct cb_param)); + p->pent = pent; + p->sector = sector; + p->segment = i; + p->vblock = vblock; /* dbg */ + + /* Get that block from the store. */ + async_read(vdi, vblock, read_cb, (void *)p); - rsp = (blkif_response_t *)req; - rsp->id = req->id; - rsp->operation = BLKIF_OP_READ; - rsp->status = BLKIF_RSP_OKAY; + } + + return BLKTAP_STOLEN; - return BLKTAP_RESPOND; err: rsp = (blkif_response_t *)req; rsp->id = req->id; @@ -355,6 +404,37 @@ err: return BLKTAP_RESPOND; } +static void write_cb(struct io_ret r, void *in_param) +{ + struct cb_param *param = (struct cb_param *)in_param; + pending_t *p = param->pent; + blkif_request_t *req = p->req; + + /* catch errors from the block code. */ + if (IO_INT(r) < 0) p->error++; + + pthread_mutex_lock(&p->mutex); + p->count--; + + if (p->count == 0) { + blkif_response_t *rsp; + + rsp = (blkif_response_t *)req; + rsp->id = req->id; + rsp->operation = BLKIF_OP_WRITE; + if (p->error == 0) { + rsp->status = BLKIF_RSP_OKAY; + } else { + rsp->status = BLKIF_RSP_ERROR; + } + blktap_inject_response(rsp); + } + + pthread_mutex_unlock(&p->mutex); + + free(param); /* TODO: replace with cached alloc/dealloc */ +} + int parallax_write(blkif_request_t *req, blkif_t *blkif) { blkif_response_t *rsp; @@ -364,13 +444,20 @@ int parallax_write(blkif_request_t *req, blkif_t *blkif) char *spage; unsigned long size, offset, start; vdi_t *vdi; + pending_t *pent; vdi = blkif_get_vdi(blkif, req->device); if ( vdi == NULL ) goto err; + + pent = &pending_list[ID_TO_IDX(req->id)]; + pent->count = req->nr_segments; + pent->req = req; + pthread_mutex_init(&pent->mutex, NULL); for (i = 0; i < req->nr_segments; i++) { + struct cb_param *p; spage = (char *)MMAP_VADDR(ID_TO_IDX(req->id), i); @@ -379,10 +466,6 @@ int parallax_write(blkif_request_t *req, blkif_t *blkif) sector = req->sector_number + (8*i); vblock = (sector << SECTOR_SHIFT) >> BLOCK_SHIFT; - /* Get that block from the store. */ - - gblock = vdi_lookup_block(vdi, vblock, &writable); - /* Calculate read size and offset within the read block. */ offset = (sector << SECTOR_SHIFT) % BLOCK_SIZE; @@ -405,27 +488,20 @@ int parallax_write(blkif_request_t *req, blkif_t *blkif) printf("]\n] STRANGE WRITE!\n]\n"); goto err; } - - if (( gblock == 0 ) || ( writable == 0 )) { - - gblock = allocblock(spage); - vdi_update_block(vdi, vblock, gblock); - - } else { - - /* write-in-place, no need to change mappings. */ - writeblock(gblock, spage); - - } - + + /* TODO: Replace this call to malloc with a cached allocation */ + p = (struct cb_param *)malloc(sizeof(struct cb_param)); + p->pent = pent; + p->sector = sector; + p->segment = i; + p->vblock = vblock; /* dbg */ + + /* Issue the write to the store. */ + async_write(vdi, vblock, spage, write_cb, (void *)p); } - rsp = (blkif_response_t *)req; - rsp->id = req->id; - rsp->operation = BLKIF_OP_WRITE; - rsp->status = BLKIF_RSP_OKAY; + return BLKTAP_STOLEN; - return BLKTAP_RESPOND; err: rsp = (blkif_response_t *)req; rsp->id = req->id; @@ -477,16 +553,19 @@ void __init_parallax(void) } + int main(int argc, char *argv[]) { DPRINTF("parallax: starting.\n"); __init_blockstore(); DPRINTF("parallax: initialized blockstore...\n"); + init_block_async(); + DPRINTF("parallax: initialized async blocks...\n"); __init_vdi(); DPRINTF("parallax: initialized vdi registry etc...\n"); __init_parallax(); DPRINTF("parallax: initialized local stuff..\n"); - + blktap_register_ctrl_hook("parallax_control", parallax_control); blktap_register_request_hook("parallax_request", parallax_request); DPRINTF("parallax: added ctrl + request hooks, starting listen...\n"); diff --git a/tools/blktap/radix.c b/tools/blktap/radix.c index 44aa0ac482..579df2e655 100644 --- a/tools/blktap/radix.c +++ b/tools/blktap/radix.c @@ -25,18 +25,6 @@ #define DEBUG */ -/* -#define STAGED -*/ - -#define ZERO 0LL -#define ONE 1LL -#define ONEMASK 0xffffffffffffffeLL - - -typedef u64 *radix_tree_node; - - /* Experimental radix cache. */ static pthread_mutex_t rcache_mutex = PTHREAD_MUTEX_INITIALIZER; @@ -276,7 +264,6 @@ radix_tree_node cloneblock(radix_tree_node block) { * * @return: value on success, zero on error */ -#ifndef STAGED u64 lookup(int height, u64 root, u64 key) { radix_tree_node node; @@ -318,92 +305,6 @@ u64 lookup(int height, u64 root, u64 key) { return ZERO; } -#else /* STAGED */ - - -/* non-recursive staged lookup, assume height is 35. */ -u64 lookup(int height, u64 root, u64 key) { - radix_tree_node node; - u64 mask = ONE; - -printf("lookup!\n"); - assert(key >> 35 == 0); - - /* the root block may be smaller to ensure all leaves are full */ - height = 27; - - /* now carve off equal sized chunks at each step */ - - /* ROOT: (LEVEL 0) KEYLEN=35*/ - if (getid(root) == ZERO) - return ZERO; - - node = (radix_tree_node) readblock(getid(root)); - if (node == NULL) - return ZERO; - - root = node[(key >> height) & RADIX_TREE_MAP_MASK]; - mask &= root; - freeblock(node); - - if (height == 0) - return ( root & ONEMASK ) | mask; - - height -= RADIX_TREE_MAP_SHIFT; /* == 18 */ - - /* LEVEL 1: KEYLEN=26*/ - if (getid(root) == ZERO) - return ZERO; - - node = (radix_tree_node) readblock(getid(root)); - if (node == NULL) - return ZERO; - - root = node[(key >> height) & RADIX_TREE_MAP_MASK]; - mask &= root; - freeblock(node); - - if (height == 0) - return ( root & ONEMASK ) | mask; - - height -= RADIX_TREE_MAP_SHIFT; /* == 9 */ - - /* LEVEL 2: KEYLEN=17*/ - if (getid(root) == ZERO) - return ZERO; - - node = (radix_tree_node) readblock(getid(root)); - if (node == NULL) - return ZERO; - - root = node[(key >> height) & RADIX_TREE_MAP_MASK]; - mask &= root; - freeblock(node); - - if (height == 0) - return ( root & ONEMASK ) | mask; - - height -= RADIX_TREE_MAP_SHIFT; /* == 0 */ - - /* LEVEL 3: KEYLEN=8*/ - if (getid(root) == ZERO) - return ZERO; - - node = (radix_tree_node) readblock(getid(root)); - if (node == NULL) - return ZERO; - - root = node[(key >> height) & RADIX_TREE_MAP_MASK]; - mask &= root; - freeblock(node); - - // if (height == 0) - return ( root & ONEMASK ) | mask; - -} - -#endif - /* * update: set a radix tree entry, doing copy-on-write as necessary * @height: height in bits of the radix tree @@ -414,9 +315,6 @@ printf("lookup!\n"); * @returns: (possibly new) root id on success (with LSB=1), 0 on failure */ -#ifndef STAGED - - u64 update(int height, u64 root, u64 key, u64 val) { int offset; u64 child; @@ -487,320 +385,6 @@ u64 update(int height, u64 root, u64 key, u64 val) { return root; } - -#else /* STAGED */ - -/* When update is called, state->next points to the thing to call after - * update is finished. */ - -struct cb_state_st; - -typedef struct { - /* public stuff */ - u64 val; - u64 key; - u64 result; - - /* internal state */ - u64 root[4]; - radix_tree_node node[4]; - void (*next)(struct cb_state_st *); - int err; -} radix_update_t; - -typedef struct cb_state_st{ - void (*next)(struct cb_state_st *); /* Next continuation. */ - union { - radix_update_t update; - } radix; -} cb_state_t; - -void s_readblock(cb_state_t *state, u64 id, void **ret) -{ - *ret = readblock(id); - state->next(state); -} - -void s_allocblock(cb_state_t *state, void *block, u64 *ret) -{ - *ret = allocblock(block); - state->next(state); -} - -void s_writeblock(cb_state_t *state, u64 id, void *block, int *ret) -{ - *ret = writeblock(id, block); - state->next(state); -} - -void cb_done(cb_state_t *state) -{ - printf("*** done ***\n"); -} - -/* forward prototypes. */ -void up0(cb_state_t *state); -void up1(cb_state_t *state); -void up2(cb_state_t *state); -void up3(cb_state_t *state); -void up4(cb_state_t *state); -void up5(cb_state_t *state); -void up6(cb_state_t *state); -void up7(cb_state_t *state); -void up8(cb_state_t *state); -void up9(cb_state_t *state); -void up10(cb_state_t *state); -void up11(cb_state_t *state); -void up12(cb_state_t *state); - -u64 update(int height, u64 root, u64 key, u64 val) -{ - cb_state_t state; - radix_update_t *u = &state.radix.update; - - u->val = val; - u->key = key; - u->root[0] = root; - u->root[1] = u->root[2] = u->root[3] = ZERO; - u->node[0] = u->node[1] = u->node[2] = u->node[3] = NULL; - - /* take a copy of the higher-scoped next continuation. */ - u->next = state->next; - - /* update start state */ - state->next = up0; - - for (;;) - { - state->next(state); - if (state->next == NULL) - break; - } - - return u->result; -} - -/* c0:*/ -void up0(cb_state_t *state) { - radix_update_t *u = &state->radix.update; - - state->next = up1; - s_readblock(state, getid(u->root[0]), (void **)&(u->node[0])); -} - -/* c1: */ -void up1(cb_state_t *state) { - radix_update_t *u = &state->radix.update; - - u->root[1] = u->node[0][u->key >> 27 & RADIX_TREE_MAP_MASK]; - if (u->root[1] == ZERO) { - u->node[1] = (radix_tree_node) newblock(); - /* goto next continuation (c2)*/ up2(state);return; - } else { - state->next = up2; - s_readblock(state, getid(u->root[1]), (void **)&(u->node[1])); - } -} - -/* c2: */ -void up2(cb_state_t *state) { - radix_update_t *u = &state->radix.update; - - if ((u->root[1] != ZERO) && (!iswritable(u->root[1]))) { - /* need to clone this node */ - radix_tree_node oldnode = u->node[1]; - u->node[1] = cloneblock(u->node[1]); - freeblock(oldnode); - u->root[1] = ZERO; - } - u->root[2] = u->node[1][u->key >> 18 & RADIX_TREE_MAP_MASK]; - if (u->root[2] == ZERO) { - u->node[2] = (radix_tree_node) newblock(); - /* goto next continuation (c3)*/ up3(state);return; - } else { - state->next = up3; - s_readblock(state, getid(u->root[2]), (void **)&(u->node[2])); - } -} - -/* c3: */ -void up3(cb_state_t *state) { - radix_update_t *u = &state->radix.update; - - if ((u->root[2] != ZERO) && (!iswritable(u->root[2]))) { - /* need to clone this node */ - radix_tree_node oldnode = u->node[2]; - u->node[2] = cloneblock(u->node[2]); - freeblock(oldnode); - u->root[2] = ZERO; - } - u->root[3] = u->node[2][u->key >> 9 & RADIX_TREE_MAP_MASK]; - if (u->root[3] == ZERO) { - u->node[3] = (radix_tree_node) newblock(); - /* goto next continuation (c4)*/ up4(state);return; - } else { - state->next = up4; - s_readblock(state, getid(u->root[3]), (void **)&(u->node[3])); - } -} - -/* c4: */ -void up4(cb_state_t *state) { - radix_update_t *u = &state->radix.update; - - if ((u->root[3] != ZERO) && (!iswritable(u->root[3]))) { - /* need to clone this node */ - radix_tree_node oldnode = u->node[3]; - u->node[3] = cloneblock(u->node[3]); - freeblock(oldnode); - u->root[3] = ZERO; - } - - if (u->node[3][u->key & RADIX_TREE_MAP_MASK] == u->val){ - /* no change, so we already owned the child */ - /* goto last continuation (c12) */ up12(state);return; - } - - u->node[3][u->key & RADIX_TREE_MAP_MASK] = u->val; - - /* new/cloned blocks need to be saved */ - if (u->root[3] == ZERO) { - /* mark this as an owned block */ - state->next = up5; - s_allocblock(state, u->node[3], &u->root[3]); - /* goto continuation (c5) */ return; - } else { - state->next = up6; - s_writeblock(state, getid(u->root[3]), u->node[3], &u->err); - /* goto continuation (c6) */ return; - } -} - -/* c5: */ -void up5(cb_state_t *state) { - radix_update_t *u = &state->radix.update; - - if (u->root[3]) - u->root[3] = writable(u->root[3]); - /* goto continuation (c6) */ up6(state);return; -} - -/* c6: */ -void up6(cb_state_t *state) { - radix_update_t *u = &state->radix.update; - - if (u->node[2][u->key >> 9 & RADIX_TREE_MAP_MASK] == u->root[3]){ - /* no change, so we already owned the child */ - /* goto last continuation (c12) */ up12(state);return; - } - - u->node[2][u->key >> 9 & RADIX_TREE_MAP_MASK] = u->root[3]; - - /* new/cloned blocks need to be saved */ - if (u->root[2] == ZERO) { - /* mark this as an owned block */ - state->next = up7; - s_allocblock(state, u->node[2], &u->root[2]); - /* goto continuation (c7) */return; - } else { - state->next = up8; - s_writeblock(state, getid(u->root[2]), u->node[2], &u->err); - /* goto continuation (c8) */return; - } -} - -/* c7: */ -void up7(cb_state_t *state) { - radix_update_t *u = &state->radix.update; - - if (u->root[2]) - u->root[2] = writable(u->root[2]); - /* goto continuation (c8) */ up8(state);return; -} - -/* c8: */ -void up8(cb_state_t *state) { - radix_update_t *u = &state->radix.update; - - if (u->node[1][u->key >> 18 & RADIX_TREE_MAP_MASK] == u->root[2]){ - /* no change, so we already owned the child */ - /* goto last continuation (c12) */ up12(state);return; - } - - u->node[1][u->key >> 18 & RADIX_TREE_MAP_MASK] = u->root[2]; - - /* new/cloned blocks need to be saved */ - if (u->root[1] == ZERO) { - /* mark this as an owned block */ - state->next = up9; - s_allocblock(state, u->node[1], &u->root[1]); - /* goto continuation (c9) */return; - } else { - state->next = up10; - s_writeblock(state, getid(u->root[1]), u->node[1], &u->err); - /* goto continuation (c10) */return; - } -} - -/* c9: */ -void up9(cb_state_t *state) { - radix_update_t *u = &state->radix.update; - - if (u->root[1]) - u->root[1] = writable(u->root[1]); - /* goto continuation (c10) */ up10(state);return; -} - -/* c10: */ -void up10(cb_state_t *state) { - radix_update_t *u = &state->radix.update; - - if (u->node[0][u->key >> 27 & RADIX_TREE_MAP_MASK] == u->root[1]){ - /* no change, so we already owned the child */ - /* goto last continuation (c12) */ up12(state);return; - } - - u->node[0][u->key >> 27 & RADIX_TREE_MAP_MASK] = u->root[1]; - - /* new/cloned blocks need to be saved */ - if (u->root[0] == ZERO) { - /* mark this as an owned block */ - state->next = up11; - s_allocblock(state, u->node[0], &u->root[0]); - /* goto continuation (c11) */ return; - } else { - state->next = up10; - s_writeblock(state, getid(u->root[0]), u->node[0], &u->err); - /* goto continuation (c12) */ return; - } -} - -/* c11: */ -void up11(cb_state_t *state) { - radix_update_t *u = &state->radix.update; - - if (u->root[0]) - u->root[0] = writable(u->root[0]); - /* goto continuation (c12) */ up12(state);return; -} - -/* c12: */ -void up12(cb_state_t *state) { - radix_update_t *u = &state->radix.update; - - int i; - for (i=0;i<4;i++) - if(u->node[i] != NULL) freeblock(u->node[i]); - - u->result = u->root[0]; - state->next = u->next; - - state->next(state);return; -} - -#endif - - /** * snapshot: create a snapshot * @root: old root node @@ -840,7 +424,6 @@ int collapse(int height, u64 proot, u64 croot) int i, numlinks, ret, total = 0; radix_tree_node pnode, cnode; -//printf("proot: %Ld\n", getid(proot)); if (height == 0) { height = -1; /* terminate recursion */ } else { diff --git a/tools/blktap/radix.h b/tools/blktap/radix.h index e7de3f453e..61ea2205f8 100644 --- a/tools/blktap/radix.h +++ b/tools/blktap/radix.h @@ -16,6 +16,16 @@ #define putid(x) ((x)<<1) #define writable(x) (((x)<<1)|1LL) #define iswritable(x) ((x)&1LL) +#define ZERO 0LL +#define ONE 1LL +#define ONEMASK 0xffffffffffffffeLL + +#define RADIX_TREE_MAP_SHIFT 9 +#define RADIX_TREE_MAP_MASK 0x1ff +#define RADIX_TREE_MAP_ENTRIES 512 + +typedef u64 *radix_tree_node; + /* * main api diff --git a/tools/blktap/requests-async.c b/tools/blktap/requests-async.c new file mode 100755 index 0000000000..bb2d07b60a --- /dev/null +++ b/tools/blktap/requests-async.c @@ -0,0 +1,629 @@ +/* read.c + * + * asynchronous read experiment for parallax. + */ + +#include +#include +#include +#include +#include +#include "requests-async.h" +#include "vdi.h" +#include "radix.h" + +#define L1_IDX(_a) (((_a) & 0x0000000007fc0000ULL) >> 18) +#define L2_IDX(_a) (((_a) & 0x000000000003fe00ULL) >> 9) +#define L3_IDX(_a) (((_a) & 0x00000000000001ffULL)) + + + +//#define STANDALONE + +#if 0 +#define DPRINTF(_f, _a...) printf ( _f , ## _a ) +#else +#define DPRINTF(_f, _a...) ((void)0) +#endif + + +struct io_req { + enum { IO_OP_READ, IO_OP_WRITE } op; + u64 root; + u64 vaddr; + int state; + io_cb_t cb; + void *param; + struct radix_lock *lock; + + /* internal stuff: */ + struct io_ret retval;/* holds the return while we unlock. */ + char *block; /* the block to write */ + radix_tree_node radix[3]; + u64 radix_addr[3]; +}; + +void clear_w_bits(radix_tree_node node) +{ + int i; + for (i=0; iradix[0] = req->radix[1] = req->radix[2] = NULL; + + if (req == NULL) {perror("req was NULL in async_read"); return(-1); } + + req->op = IO_OP_READ; + req->root = vdi->radix_root; + req->lock = vdi->radix_lock; + req->vaddr = vaddr; + req->cb = cb; + req->param = param; + req->state = READ_LOCKED; + + block_rlock(req->lock, L1_IDX(vaddr), read_cb, req); + + return 0; +} + + +int async_write(vdi_t *vdi, u64 vaddr, char *block, + io_cb_t cb, void *param) +{ + struct io_req *req; + + + req = (struct io_req *)malloc(sizeof (struct io_req)); + req->radix[0] = req->radix[1] = req->radix[2] = NULL; + //DPRINTF("async_write\n"); + + if (req == NULL) {perror("req was NULL in async_write"); return(-1); } + + req->op = IO_OP_WRITE; + req->root = vdi->radix_root; + req->lock = vdi->radix_lock; + req->vaddr = vaddr; + req->block = block; + req->cb = cb; + req->param = param; + req->radix_addr[L1] = getid(req->root); /* for consistency */ + req->state = WRITE_LOCKED; + + block_wlock(req->lock, L1_IDX(vaddr), write_cb, req); + + + return 0; +} + +void read_cb(struct io_ret ret, void *param) +{ + struct io_req *req = (struct io_req *)param; + radix_tree_node node; + u64 idx; + char *block; + void *req_param; + + DPRINTF("read_cb\n"); + /* get record */ + switch(req->state) { + + case READ_LOCKED: + + DPRINTF("READ_LOCKED\n"); + req->state = READ_L1; + block_read(getid(req->root), read_cb, req); + break; + + case READ_L1: /* block is the radix root */ + + DPRINTF("READ_L1\n"); + block = IO_BLOCK(ret); + if (block == NULL) goto fail; + node = (radix_tree_node) block; + idx = getid( node[L1_IDX(req->vaddr)] ); + free(block); + if ( idx == ZERO ) { + req->state = RETURN_ZERO; + block_runlock(req->lock, L1_IDX(req->vaddr), read_cb, req); + } else { + req->state = READ_L2; + block_read(idx, read_cb, req); + } + break; + + case READ_L2: + + DPRINTF("READ_L2\n"); + block = IO_BLOCK(ret); + if (block == NULL) goto fail; + node = (radix_tree_node) block; + idx = getid( node[L2_IDX(req->vaddr)] ); + free(block); + if ( idx == ZERO ) { + req->state = RETURN_ZERO; + block_runlock(req->lock, L1_IDX(req->vaddr), read_cb, req); + } else { + req->state = READ_L3; + block_read(idx, read_cb, req); + } + break; + + case READ_L3: + + DPRINTF("READ_L3\n"); + block = IO_BLOCK(ret); + if (block == NULL) goto fail; + node = (radix_tree_node) block; + idx = getid( node[L3_IDX(req->vaddr)] ); + free(block); + if ( idx == ZERO ) { + req->state = RETURN_ZERO; + block_runlock(req->lock, L1_IDX(req->vaddr), read_cb, req); + } else { + req->state = READ_DATA; + block_read(idx, read_cb, req); + } + break; + + case READ_DATA: + + DPRINTF("READ_DATA\n"); + if (IO_BLOCK(ret) == NULL) goto fail; + req->retval = ret; + req->state = READ_UNLOCKED; + block_runlock(req->lock, L1_IDX(req->vaddr), read_cb, req); + break; + + case READ_UNLOCKED: + { + struct io_ret r; + io_cb_t cb; + DPRINTF("READ_UNLOCKED\n"); + req_param = req->param; + r = req->retval; + cb = req->cb; + free(req); + cb(r, req_param); + break; + } + + case RETURN_ZERO: + { + struct io_ret r; + io_cb_t cb; + DPRINTF("RETURN_ZERO\n"); + req_param = req->param; + cb = req->cb; + free(req); + r.type = IO_BLOCK_T; + r.u.b = newblock(); + cb(r, req_param); + break; + } + + default: + DPRINTF("*** Write: Bad state! (%d) ***\n", req->state); + goto fail; + } + + return; + + fail: + { + struct io_ret r; + io_cb_t cb; + DPRINTF("asyn_read had a read error.\n"); + req_param = req->param; + r = ret; + cb = req->cb; + free(req); + cb(r, req_param); + } + + +} + +void write_cb(struct io_ret r, void *param) +{ + struct io_req *req = (struct io_req *)param; + radix_tree_node node; + u64 a, addr; + void *req_param; + + //DPRINTF("write_cb\n"); + switch(req->state) { + + case WRITE_LOCKED: + + DPRINTF("WRITE_LOCKED (%llu)\n", L1_IDX(req->vaddr)); + req->state = READ_L1; + block_read(getid(req->root), write_cb, req); + break; + + case READ_L1: /* block is the radix root */ + + DPRINTF("READ_L1\n"); + node = (radix_tree_node) IO_BLOCK(r); + if (node == NULL) goto fail; + a = node[L1_IDX(req->vaddr)]; + addr = getid(a); + + req->radix_addr[L2] = addr; + req->radix[L1] = node; + + if ( addr == ZERO ) { + /* L1 empty subtree: */ + req->state = ALLOC_DATA_L1z; + block_alloc( req->block, write_cb, req ); + } else if ( !iswritable(a) ) { + /* L1 fault: */ + req->state = READ_L2_L1f; + block_read( addr, write_cb, req ); + } else { + req->state = READ_L2; + block_read( addr, write_cb, req ); + } + break; + + case READ_L2: + + DPRINTF("READ_L2\n"); + node = (radix_tree_node) IO_BLOCK(r); + if (node == NULL) goto fail; + a = node[L2_IDX(req->vaddr)]; + addr = getid(a); + + req->radix_addr[L3] = addr; + req->radix[L2] = node; + + if ( addr == ZERO ) { + /* L2 empty subtree: */ + req->state = ALLOC_DATA_L2z; + block_alloc( req->block, write_cb, req ); + } else if ( !iswritable(a) ) { + /* L2 fault: */ + req->state = READ_L3_L2f; + block_read( addr, write_cb, req ); + } else { + req->state = READ_L3; + block_read( addr, write_cb, req ); + } + break; + + case READ_L3: + + DPRINTF("READ_L3\n"); + node = (radix_tree_node) IO_BLOCK(r); + if (node == NULL) goto fail; + a = node[L3_IDX(req->vaddr)]; + addr = getid(a); + + req->radix[L3] = node; + + if ( addr == ZERO ) { + /* L3 fault: */ + req->state = ALLOC_DATA_L3z; + block_alloc( req->block, write_cb, req ); + } else if ( !iswritable(a) ) { + /* L3 fault: */ + req->state = ALLOC_DATA_L3f; + block_alloc( req->block, write_cb, req ); + } else { + req->state = WRITE_DATA; + block_write( addr, req->block, write_cb, req ); + } + break; + + /* L3 Zero Path: */ + + case ALLOC_DATA_L3z: + + DPRINTF("ALLOC_DATA_L3z\n"); + addr = IO_ADDR(r); + a = writable(addr); + req->radix[L3][L3_IDX(req->vaddr)] = a; + req->state = WRITE_L3_L3z; + block_write(req->radix_addr[L3], (char*)req->radix[L3], write_cb, req); + break; + + /* L3 Fault Path: */ + + case ALLOC_DATA_L3f: + + DPRINTF("ALLOC_DATA_L3f\n"); + addr = IO_ADDR(r); + a = writable(addr); + req->radix[L3][L3_IDX(req->vaddr)] = a; + req->state = WRITE_L3_L3f; + block_write(req->radix_addr[L3], (char*)req->radix[L3], write_cb, req); + break; + + /* L2 Zero Path: */ + + case ALLOC_DATA_L2z: + + DPRINTF("ALLOC_DATA_L2z\n"); + addr = IO_ADDR(r); + a = writable(addr); + req->radix[L3] = newblock(); + req->radix[L3][L3_IDX(req->vaddr)] = a; + req->state = ALLOC_L3_L2z; + block_alloc( (char*)req->radix[L3], write_cb, req ); + break; + + case ALLOC_L3_L2z: + + DPRINTF("ALLOC_L3_L2z\n"); + addr = IO_ADDR(r); + a = writable(addr); + req->radix[L2][L2_IDX(req->vaddr)] = a; + req->state = WRITE_L2_L2z; + block_write(req->radix_addr[L2], (char*)req->radix[L2], write_cb, req); + break; + + /* L2 Fault Path: */ + + case READ_L3_L2f: + + DPRINTF("READ_L3_L2f\n"); + node = (radix_tree_node) IO_BLOCK(r); + clear_w_bits(node); + if (node == NULL) goto fail; + a = node[L2_IDX(req->vaddr)]; + addr = getid(a); + + req->radix[L3] = node; + req->state = ALLOC_DATA_L2f; + block_alloc( req->block, write_cb, req ); + break; + + case ALLOC_DATA_L2f: + + DPRINTF("ALLOC_DATA_L2f\n"); + addr = IO_ADDR(r); + a = writable(addr); + req->radix[L3][L3_IDX(req->vaddr)] = a; + req->state = ALLOC_L3_L2f; + block_alloc( (char*)req->radix[L3], write_cb, req ); + break; + + case ALLOC_L3_L2f: + + DPRINTF("ALLOC_L3_L2f\n"); + addr = IO_ADDR(r); + a = writable(addr); + req->radix[L2][L2_IDX(req->vaddr)] = a; + req->state = WRITE_L2_L2f; + block_write(req->radix_addr[L2], (char*)req->radix[L2], write_cb, req); + break; + + /* L1 Zero Path: */ + + case ALLOC_DATA_L1z: + + DPRINTF("ALLOC_DATA_L1z\n"); + addr = IO_ADDR(r); + a = writable(addr); + req->radix[L3] = newblock(); + req->radix[L3][L3_IDX(req->vaddr)] = a; + req->state = ALLOC_L3_L1z; + block_alloc( (char*)req->radix[L3], write_cb, req ); + break; + + case ALLOC_L3_L1z: + + DPRINTF("ALLOC_L3_L1z\n"); + addr = IO_ADDR(r); + a = writable(addr); + req->radix[L2] = newblock(); + req->radix[L2][L2_IDX(req->vaddr)] = a; + req->state = ALLOC_L2_L1z; + block_alloc( (char*)req->radix[L2], write_cb, req ); + break; + + case ALLOC_L2_L1z: + + DPRINTF("ALLOC_L2_L1z\n"); + addr = IO_ADDR(r); + a = writable(addr); + req->radix[L1][L1_IDX(req->vaddr)] = a; + req->state = WRITE_L1_L1z; + block_write(req->radix_addr[L1], (char*)req->radix[L1], write_cb, req); + break; + + /* L1 Fault Path: */ + + case READ_L2_L1f: + + DPRINTF("READ_L2_L1f\n"); + node = (radix_tree_node) IO_BLOCK(r); + clear_w_bits(node); + if (node == NULL) goto fail; + a = node[L2_IDX(req->vaddr)]; + addr = getid(a); + + req->radix_addr[L3] = addr; + req->radix[L2] = node; + + if (addr == ZERO) { + /* nothing below L2, create an empty L3 and alloc data. */ + /* (So skip READ_L3_L1f.) */ + req->radix[L3] = newblock(); + req->state = ALLOC_DATA_L1f; + block_alloc( req->block, write_cb, req ); + } else { + req->state = READ_L3_L1f; + block_read( addr, write_cb, req ); + } + break; + + case READ_L3_L1f: + + DPRINTF("READ_L3_L1f\n"); + node = (radix_tree_node) IO_BLOCK(r); + clear_w_bits(node); + if (node == NULL) goto fail; + a = node[L2_IDX(req->vaddr)]; + addr = getid(a); + + req->radix[L3] = node; + req->state = ALLOC_DATA_L1f; + block_alloc( req->block, write_cb, req ); + break; + + case ALLOC_DATA_L1f: + + DPRINTF("ALLOC_DATA_L1f\n"); + addr = IO_ADDR(r); + a = writable(addr); + req->radix[L3][L3_IDX(req->vaddr)] = a; + req->state = ALLOC_L3_L1f; + block_alloc( (char*)req->radix[L3], write_cb, req ); + break; + + case ALLOC_L3_L1f: + + DPRINTF("ALLOC_L3_L1f\n"); + addr = IO_ADDR(r); + a = writable(addr); + req->radix[L2][L2_IDX(req->vaddr)] = a; + req->state = ALLOC_L2_L1f; + block_alloc( (char*)req->radix[L2], write_cb, req ); + break; + + case ALLOC_L2_L1f: + + DPRINTF("ALLOC_L2_L1f\n"); + addr = IO_ADDR(r); + a = writable(addr); + req->radix[L1][L1_IDX(req->vaddr)] = a; + req->state = WRITE_L1_L1f; + block_write(req->radix_addr[L1], (char*)req->radix[L1], write_cb, req); + break; + + case WRITE_DATA: + case WRITE_L3_L3z: + case WRITE_L3_L3f: + case WRITE_L2_L2z: + case WRITE_L2_L2f: + case WRITE_L1_L1z: + case WRITE_L1_L1f: + { + int i; + DPRINTF("DONE\n"); + /* free any saved node vals. */ + for (i=0; i<3; i++) + if (req->radix[i] != 0) free(req->radix[i]); + req->retval = r; + req->state = WRITE_UNLOCKED; + block_wunlock(req->lock, L1_IDX(req->vaddr), write_cb, req); + break; + } + case WRITE_UNLOCKED: + { + struct io_ret r; + io_cb_t cb; + DPRINTF("WRITE_UNLOCKED!\n"); + req_param = req->param; + r = req->retval; + cb = req->cb; + free(req); + cb(r, req_param); + break; + } + + default: + DPRINTF("*** Write: Bad state! (%d) ***\n", req->state); + goto fail; + } + + return; + + fail: + { + struct io_ret r; + io_cb_t cb; + DPRINTF("asyn_write had a read error mid-way.\n"); + req_param = req->param; + cb = req->cb; + r.type = IO_INT_T; + r.u.i = -1; + free(req); + cb(r, req_param); + } +} + diff --git a/tools/blktap/requests-async.h b/tools/blktap/requests-async.h new file mode 100755 index 0000000000..503a543b7e --- /dev/null +++ b/tools/blktap/requests-async.h @@ -0,0 +1,19 @@ +#ifndef _REQUESTSASYNC_H_ +#define _REQUESTSASYNC_H_ + +#include "block-async.h" +#include "blockstore.h" /* for newblock etc. */ + +/* +#define BLOCK_SIZE 4096 +#define ZERO 0ULL +#define getid(x) (((x)>>1)&0x7fffffffffffffffLLU) +#define iswritable(x) (((x) & 1LLU) != 0) +#define writable(x) (((x) << 1) | 1LLU) +#define readonly(x) ((u64)((x) << 1)) +*/ + +int async_read (vdi_t *vdi, u64 vaddr, io_cb_t cb, void *param); +int async_write(vdi_t *vdi, u64 vaddr, char *block, io_cb_t cb, void *param); + +#endif //_REQUESTSASYNC_H_ diff --git a/tools/blktap/vdi.c b/tools/blktap/vdi.c index b3a84f0244..490a2e691a 100644 --- a/tools/blktap/vdi.c +++ b/tools/blktap/vdi.c @@ -11,14 +11,16 @@ #include #include #include +#include #include "blockstore.h" +#include "block-async.h" #include "radix.h" #include "vdi.h" #define VDI_REG_BLOCK 2LL #define VDI_RADIX_ROOT writable(3) -#if 1 +#if 0 #define DPRINTF(_f, _a...) printf ( _f , ## _a ) #else #define DPRINTF(_f, _a...) ((void)0) @@ -66,6 +68,7 @@ vdi_registry_t *get_vdi_registry(void) return vdi_reg; } + vdi_t *vdi_create(snap_id_t *parent_snap, char *name) { int ret; @@ -106,12 +109,22 @@ vdi_t *vdi_create(snap_id_t *parent_snap, char *name) vdi->id = vdi_reg->nr_vdis++; strncpy(vdi->name, name, VDI_NAME_SZ); vdi->name[VDI_NAME_SZ] = '\0'; + vdi->radix_lock = NULL; /* for tidiness */ writeblock(vdi->block, (void *)vdi); update(VDI_REG_HEIGHT, VDI_RADIX_ROOT, vdi->id, vdi->block); writeblock(VDI_REG_BLOCK, (void *)vdi_reg); freeblock(vdi_reg); + vdi->radix_lock = (struct radix_lock *)malloc(sizeof(struct radix_lock)); + if (vdi->radix_lock == NULL) + { + perror("couldn't malloc radix_lock for new vdi!"); + freeblock(vdi); + return NULL; + } + radix_lock_init(vdi->radix_lock); + return vdi; } @@ -126,6 +139,16 @@ vdi_t *vdi_get(u64 vdi_id) return NULL; vdi = (vdi_t *)readblock(vdi_blk); + + vdi->radix_lock = (struct radix_lock *)malloc(sizeof(struct radix_lock)); + if (vdi->radix_lock == NULL) + { + perror("couldn't malloc radix_lock for new vdi!"); + freeblock(vdi); + return NULL; + } + radix_lock_init(vdi->radix_lock); + return vdi; } diff --git a/tools/blktap/vdi.h b/tools/blktap/vdi.h index dd32102dad..1c04e79393 100644 --- a/tools/blktap/vdi.h +++ b/tools/blktap/vdi.h @@ -1,3 +1,5 @@ +#ifndef _VDI_H_ +#define _VDI_H_ /************************************************************************** * * vdi.h @@ -12,11 +14,12 @@ #include "blktaplib.h" #include "snaplog.h" -#define VDI_HEIGHT 35 -#define VDI_REG_HEIGHT 35 /* why not? */ +#define VDI_HEIGHT 27 /* Note that these are now hard-coded */ +#define VDI_REG_HEIGHT 27 /* in the async lookup code */ #define VDI_NAME_SZ 256 + typedef struct vdi { u64 id; /* unique vdi id -- used by the registry */ u64 block; /* block where this vdi lives (also unique)*/ @@ -24,6 +27,7 @@ typedef struct vdi { snap_id_t snap; /* next snapshot slot for this VDI */ struct vdi *next; /* used to hash-chain in blkif. */ blkif_vdev_t vdevice; /* currently mounted as... */ + struct radix_lock *radix_lock;/* per-line L1 RW lock for parallel reqs */ char name[VDI_NAME_SZ];/* human readable vdi name */ } vdi_t; @@ -46,3 +50,5 @@ void vdi_snapshot(vdi_t *vdi); #endif /* __VDI_H__ */ + +#endif //_VDI_H_ -- 2.30.2